pax_global_header00006660000000000000000000000064126427736670014536gustar00rootroot0000000000000052 comment=d16f7b3b13a4c4a10ed0be20e84fcd5472cd7d09 clblas-2.10/000077500000000000000000000000001264277366700127205ustar00rootroot00000000000000clblas-2.10/.gitattributes000066400000000000000000000007431264277366700156170ustar00rootroot00000000000000# Auto detect text files and perform LF normalization * text=auto # Custom for Visual Studio *.cs diff=csharp *.sln merge=union *.csproj merge=union *.vbproj merge=union *.fsproj merge=union *.dbproj merge=union # Standard to msysgit *.doc diff=astextplain *.DOC diff=astextplain *.docx diff=astextplain *.DOCX diff=astextplain *.dot diff=astextplain *.DOT diff=astextplain *.pdf diff=astextplain *.PDF diff=astextplain *.rtf diff=astextplain *.RTF diff=astextplain clblas-2.10/.gitignore000066400000000000000000000003741264277366700147140ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Compiled Dynamic libraries *.so *.dylib *.dll # Compiled Static libraries *.lai *.la *.a *.lib # Generated kernel template files *.clT # flags.txt file *flags.txt # vim temp files .*.swp src/build/ clblas-2.10/.travis.yml000066400000000000000000000143571264277366700150430ustar00rootroot00000000000000# Ubuntu name decoder ring; https://en.wikipedia.org/wiki/List_of_Ubuntu_releases # Ubuntu 12.04 LTS (Precise Pangolin) <== Travis CI VM image # Ubuntu 12.10 (Quantal Quetzal) # Ubuntu 13.04 (Raring Ringtail) # Ubuntu 13.10 (Saucy Salamander) # Ubuntu 14.04 LTS (Trusty Tahr) # Ubuntu 14.10 (Utopic Unicorn) # Ubuntu 15.04 (Vivid Vervet) # Ubuntu 15.10 (Wily Werewolf) # Ubuntu 16.04 LTS (Xenial Xantus) # language: instructs travis what compilers && environment to set up in build matrix language: cpp # sudo: false instructs travis to build our project in a docker VM (faster) # Can not yet install fglrx packages with 'false' sudo: required # false dist: trusty # os: expands the build matrix to include multiple os's # disable linux, as we get sporadic failures on building boost, needs investigation os: - linux - osx # compiler: expands the build matrix to include multiple compilers (per os) compiler: - gcc - clang addons: # apt: is disabled on osx builds # apt: needed by docker framework to install project dependencies without # sudo. 
Apt uses published Ubunto PPA's from https://launchpad.net/ # https://github.com/travis-ci/apt-source-whitelist/blob/master/ubuntu.json apt: sources: # ubuntu-toolchain-r-test contains newer versions of gcc to install # - ubuntu-toolchain-r-test # llvm-toolchain-precise-3.6 contains newer versions of clang to install # - llvm-toolchain-precise-3.6 # kubuntu-backports contains newer versions of cmake to install - kubuntu-backports # boost-latest contains boost v1.55 - boost-latest packages: - gfortran # g++-4.8 is minimum version considered to be the first good c++11 gnu compiler # - g++-4.8 # - clang-3.6 # We require v2.8.12 minimum - cmake # I'm finding problems between pre-compiled versions of boost ublas, with gtest # stl_algobase.h: error: no matching function for call to swap() - libboost-program-options1.55-dev # - libboost-serialization1.55-dev # - libboost-filesystem1.55-dev # - libboost-system1.55-dev # - libboost-regex1.55-dev # The package opencl-headers on 'precise' only installs v1.1 cl headers; uncomment for 'trusty' or greater # - opencl-headers # Uncomment one of the following when fglrx modules are added to the apt whitelist # - fglrx # - fglrx=2:8.960-0ubuntu1 # - fglrx=2:13.350.1-0ubuntu0.0.1 # env: specifies additional global variables to define per row in build matrix env: global: - CLBLAS_ROOT=${TRAVIS_BUILD_DIR}/bin/make/release - OPENCL_REGISTRY=https://www.khronos.org/registry/cl - OPENCL_ROOT=${TRAVIS_BUILD_DIR}/bin/opencl # The following filters our build matrix; we are interested in linux-gcc & osx-clang matrix: exclude: - os: linux compiler: clang - os: osx compiler: gcc before_install: # Remove the following linux clause when fglrx can be installed with sudo: false #- if [ ${TRAVIS_OS_NAME} == "linux" ]; then # sudo apt-get update -qq && # sudo apt-get install -qq fglrx=2:13.350.1-0ubuntu0.0.1; # fi #- if [ ${TRAVIS_OS_NAME} == "linux" ]; then # export OPENCL_ROOT="${TRAVIS_BUILD_DIR}/opencl-headers"; # fi - if [ ${TRAVIS_OS_NAME} == "osx" ]; then brew update; brew outdated boost || brew upgrade boost; brew outdated cmake || brew upgrade cmake; fi # - if [ ${CXX} = "g++" ]; then export CXX="g++-4.8" CC="gcc-4.8"; fi - cmake --version; - ${CC} --version; - ${CXX} --version; install: # 'Precise' only distributes v1.1 opencl headers; download 1.2 headers from khronos website # Remove when the travis VM upgrades to 'trusty' or beyond #- if [ ${TRAVIS_OS_NAME} == "linux" ]; then # mkdir -p ${OPENCL_ROOT}/include/CL; # pushd ${OPENCL_ROOT}/include/CL; # wget -w 1 -r -np -nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/1.2/; # popd; # fi # The following linux logic is necessary because of Travis's move to the GCE platform, which does not # currently contain packages for fglrx: https://github.com/travis-ci/travis-ci/issues/5221 # We build our own linkable .so file - if [ ${TRAVIS_OS_NAME} == "linux" ]; then mkdir -p ${OPENCL_ROOT}; pushd ${OPENCL_ROOT}; wget ${OPENCL_REGISTRY}/specs/opencl-icd-1.2.11.0.tgz; tar -xf opencl-icd-1.2.11.0.tgz; mv ./icd/* .; mkdir -p inc/CL; pushd inc/CL; wget -r -w 1 -np -nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/1.2/; wget -w 1 -np -nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/2.1/cl.hpp; popd; mkdir -p lib; pushd lib; cmake -G "Unix Makefiles" ..; make; cp ../bin/libOpenCL.so .; popd; mv inc/ include/; popd; fi # osx image does not contain cl.hpp file; download from Khronos # - if [ ${TRAVIS_OS_NAME} == "osx" ]; then # pushd /System/Library/Frameworks/OpenCL.framework/Versions/A/Headers/; # sudo wget -w 
1 -np -nd -nv -A h,hpp https://www.khronos.org/registry/cl/api/1.2/cl.hpp; # popd; # fi # Use before_script: to run configure steps before_script: - mkdir -p ${CLBLAS_ROOT} - pushd ${CLBLAS_ROOT} - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TEST=OFF -DBUILD_CLIENT=OFF -DOCL_VERSION=2.0 -DOPENCL_ROOT=${OPENCL_ROOT} ${TRAVIS_BUILD_DIR}/src # use script: to execute build steps script: - make package #deploy: # provider: releases # prerelease: true # draft: true # skip_cleanup: true # api_key: # secure: MBkxtcfSk+4UvGRO+WRhmS86vIVzAs0LIF2sAtr/S+Ed+OdUAuhZypUsDXGWtK3mL55v9c8BZXefFfHfJqElcNmyHKwCptbCR/JiM8YBtjoy2/RW1NcJUZp+QuRlk23xPADj7QkPjv7dfrQUMitkLUXAD+uTmMe2l8gmlbhMrQqPBKhb+31FNv6Lmo6oa6GjbiGi7qjsrJc7uQjhppLam+M7BZbBALGbIqMIrb2BMDMMhBoDbb4zSKrSg3+krd3kKiCClJlK7xjIlyFXZ527ETQ+PMtIeQb0eJ3aQwa4caBRCm5BDzt8GnJ48S88EkynbQioCEE87ebcyOM7M+wfslW/Fm1Y86X5odIljkOmTNKoDvgLxc9vUCBtMyVHNIgZcToPdsrMsGxcHV+JtU3yVQVm6dnA5P/zG5bA+aBjsd7p7BdOE4fdhvZV5XRAk/wmiyWalF7hKJxHIiWAKknL+tpPDDUF+fHmDDsdf7yRDJBegNcKfw4+m19MIvLn9fbiNVCtwCAL1T4yWkIEpi4MRMDPtftmkZPbi6UwluOJUTeCeHe4en99Yu2haemNPqXs6rR0LlXGk31GQwzlrNfb+94F5tT2a4Ka4PsruA2NMW/IYCYEE5Gu7PihVDR031Fn9cdCU9kefUgyB07rJD6q/W+ljsU0osyg7VxyfMg8rkw= # file: ${CLBLAS_ROOT}/clBLAS-build/*.tar.gz # file_glob: true # on: # all_branches: true # tags: true clblas-2.10/CHANGELOG000066400000000000000000000240641264277366700141400ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## clBLAS Readme Version: 1.10 Release Date: April 2013 ChangeLog: ____________ Current Version: New: * New Level 1 routines added (an 'x' implies all 4 precisions) xSWAP, xCOPY, xSCAL, CSSCAL, ZDSCAL, xAXPY, SDOT, DDOT, CDOTU, ZDOTU, CDOTC, ZDOTC, xROTG, SROTMG, DROTMG, SROT, DROT, CSROT, ZDROT, SROTM, DROTM, SNRM2, DNRM2, SCNRM2, DZNRM2, ixAMAX, SASUM, DASUM, SCASUM, DZASUM * Samples have been added for the new functions * This release tested using the 9.012 runtime driver and the 2.8 APPSDK Fixed: * Failures in *trsm functions with clMAGMA tests Known Issues: * Failures & hangs in ztrmm, *trsv, *tpsv functions on Southern Island GPU devices * Failures in zgemm functions on Northern Island GPU devices * Failures & hangs are expected to be fixed in the upcoming AMD graphics driver versions. It is strongly recommended that users keep their graphics driver versions up to date. ____________ Version 1.8.291: Fixed: * Failures in the following functions: ssyr2, ssyr2k, strsm, strsv, ssyrk, cher, ctrsv, csymm, cher2, ztrmm on Southern Island GPU devices. * Failures in the following functions: dsyr, dsyr2, dgemv, dsyrk, dsyr2k, zsyr2k on Trinity platforms. 
Known Issues: * Failures in *trsm functions with clMAGMA tests ____________ Version 1.8.269 (Beta, clMAGMA support): New: * No new routines * This release tested using the 8.961 runtime driver and the 2.6 APPSDK Known Issues: * The clBLASTune executable has been observed to hang on Windows. If this happens, abort execution of the tune program; it is not required for correct operation of the BLAS routines (as of 8.872). * clBLAS can return invalid results on CPU devices (as of 8.961). The CPU device is primarily a test/debug device, and GPU devices are unaffected. * clBLAS can return invalid results for double precision functions (dsyr, dsyr2, dgemv, dsyrk, dsyr2k, zsyr2k) on Trinity platforms (as of 8.961). * clBLAS can return invalid results (ssyr2, ssyr2k, strsm, strsv, ssyrk, cher, ctrsv, csymm, cher2, ztrmm) on Southern Island GPU devices (as of 8.961). ____________ Version 1.7 (Beta, clMAGMA support): New: * New Level 3 routines added (an 'x' implies all 4 precisions) CHER2K, ZHER2K * New Level 2 routines added (an 'x' implies all 4 precisions) xTPMV, xTPSV, SSPVM, DSPMV, CHPMV, ZHPMV, SSPR, DSPR, CHPR, ZHPR, SSPR2, DSPR2, CHPR2, ZHPR2, xGBMV, CHBMV, ZHBMV, SSBMV, DSBMV, xTBMV, xTBSV * Samples have been added for the new functions, but are not fully tested * This release tested using the 8.951 runtime driver and the 2.6 APPSDK * Note that documentation is incomplete for the new functions Known Issues: * The clBLASTune executable has been observed to hang on Windows. If this happens, abort execution of the tune program; it is not required for correct operation of the BLAS routines (as of 8.872). * clBLAS can return invalid results on CPU devices that support AVX (as of 8.951). CPU devices that support up to SSE3 are unaffected. The CPU device is primarily a test/debug device, and GPU devices are unaffected. * clBLAS can return invalid results for double precision functions (dsyr, dsyr2, dgemv, dsyrk, dsyr2k, zsyr2k) on Trinity platforms (as of 8.951). * clBLAS can return invalid results (ssyr, ssyr2, strsv, ctrsv, ssyrk, ssyr2k, ztrmm) on Southern Island GPU devices (as of 8.951). ____________ Version 1.6: New: * New Level 3 routines added (an 'x' implies all 4 precisions) CSYRK, ZSYRK, CSYR2K, ZSYR2K, CHEMM, ZHEMM, CHERK, ZHERK, xSYMM * New Level 2 routines added (an 'x' implies all 4 precisions) CGEMV, ZGEMV, xTRMV, xTRSV, CHEMV, ZHEMV, SGER, DGER, CGERU, ZGERU, CGERC, ZGERC, CHER, ZHER, CHER2, ZHER2, SSYR, DSYR, SSYR2, DSYR2 * For all the original functions prior to 1.6, a new API has been introduced with an *Ex suffix. These extended API's add new parameters that allow users to specify an offset to a matrix argument. This allows efficient sub-matrix indexing within a clBLAS routine without requiring expensive sub-matrix copy operations. * Samples have been added for the new functions * Preview: Support for AMD Radeon™ HD7000 series GPUs * This release tested using the 8.92 runtime driver and the 2.6 APP SDK Known Issues: * The clBLASTune executable has been observed to hang on Windows. If this happens, abort execution of the tune program; it is not required for correct operation of the BLAS routines (as of 8.872). * The CPU device for clBLAS is not functioning for this release (as of 8.872). The CPU device is primarily a test/debug device, and GPU devices are unaffected. 
____________ Version 1.4: New: * New Level 3 routines added SSYRK, DSYRK, SSYR2K, DSYR2K * New Level 2 routines added SGEMV, DGEMV, SSYMV, DSYMV * The image support functions (clblasAddScratchImage, clblasRemoveScratchImage) have been deprecated. Images are no longer required for the highest performance. * InstallShield is now used for APPML libraries. The default install location has changed from c:\amd\clBLAS to C:\Program Files (x86)\AMD\clBLAS. It is recommended that previous versions of clBLAS are uninstalled first. * Samples have been added for the new functions * This release tested using the 8.872 runtime driver and the 2.5 APP SDK Known Issues: * The clBLASTune executable has been observed to hang on Windows. If this happens, abort execution of the tune program; it is not required for correct operation of the BLAS routines (as of 8.872). * The CPU device for clBLAS is not functioning for this release (as of 8.872). The CPU device is primarily a test/debug device, and GPU devices are unaffected. ____________ Version 1.2: * The library now supports both 32- and 64-bit Windows and Linux operating systems. * xTRSM routines are available in 1.2. * clBLAS routines return clBLASStatus error codes, instead of native OpenCL error codes Fixed: * xTRMM routines were not properly handling implicit unit diagonal elements and implicit off-diagonal zero values specified by the BLAS parameters SIDE, UPLO and DIAG. * Possible crash with CPU device on 32-bit systems. * clblasDgemm routine return an invalid event as its last argument. * clBLAS routines return clblasStatus error codes, instead of native OpenCL error codes. Known Issues: * The clBLASTune executable has been observed to hang on Windows. If this happens, abort execution of the tune program; it is not required for correct operation of the BLAS routines (as of 8.872). * The CPU device for clBLAS is not functioning for this release (as of 8.872). The CPU device is primarily a test/debug device, and GPU devices are unaffected. ____________________ Version 1.0: * Initial release Known Issues: * Available only on Linux64. * xTRMM routines were not properly handling implicit unit diagonal elements and implicit off-diagonal zero values specified by the BLAS parameters SIDE, UPLO and DIAG * clblasDgemm returned an invalid event as its last argument _____________ Building the Samples: To install the Linux versions of clBLAS, uncompress the initial download, then execute the install script. For example: tar -xf clBLAS-${version}-Linux.tar.gz - This installs three files into the local directory, one being an executable bash script. sudo mkdir /opt/clBLAS-${version} - This pre-creates the install directory with proper permissions in /opt if it is to be installed there. (This is the default.) ./install-clBLAS-${version}.sh - This prints an EULA and uncompresses files into the chosen install directory. cd ${installDir}/bin64 export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${OpenCLLibDir}:${clBLASLibDir} - Be sure to export library dependencies to resolve all external linkages to the client program; you can create a bash script to help automate this procedure. ./example_sgemm - Run a simple client; one example is provided for each supported main BLAS function family. The sample program does not ship with native build files; instead, a CMake file is shipped, and the user generates a native build file for their system. 
For example: cd ${installDir} mkdir samplesBin/ - This creates a sister directory to the samples directory that houses the native makefiles and the generated files from the build. cd samplesBin/ ccmake ../samples/ - ccmake is a curses-based cmake program; it takes a parameter that specifies the location of the source code to compile. - Hit 'c' to configure for the platform; ensure that the dependencies to external libraries are satisfied, including paths to 'ATI Stream SDK'. - After dependencies are satisfied, hit 'c' again to finalize configuration. Then, hit 'g' to generate a makefile and exit ccmake. make help - Look at the options available for make. make - Build the sample client program. ./example_sgemm - Run a simple client; one example is provided for each supported main BLAS function family. clblas-2.10/CONTRIBUTING.md000066400000000000000000000142221264277366700151520ustar00rootroot00000000000000## Contributor guidelines Contributing code to this project is intended to be light weight and intuitive to users familiar with GitHub to actively encourage contributions, but a process is documented and should be followed to prevent chaos, confusion and despair. ## The mechanics of contributing code Firstly, in order to contribute code to this project, a contributor must have a valid and current [GitHub account](https://help.github.com/articles/set-up-git) available to use. Given an account, * The potential contributor forks this project into his/her account following the traditional [forking](https://help.github.com/articles/fork-a-repo) model native to GitHub * After forking, the contributor [clones their repository](https://help.github.com/articles/create-a-repo) locally on their machine * Code is developed and checked into the contributor's repository. These commits are eventually pushed upstream to their GitHub repository * The contributor then issues a [pull-request](https://help.github.com/articles/using-pull-requests) against the **develop** branch of this repository, which is the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow which is well suited for working with GitHub * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. Refer to the projects wiki At this point, the repository maintainers will be notified by GitHub that a 'pull request' exists pending against their repository. A code review should be completed within a few days, depending on the scope of submitted code, and the code will either be accepted, rejected or commented on for extra feedback. ## Code submission guidelines We want to ensure that the project code base maintains a level of quality over time, such that future contributors find it as easy to jump into the code as hopefully it is today. As such, pull requests should * remember that clMath is a project licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 ). If you are not already familiar, please review the license before issuing a pull request. We intend this project to be open to external contributors, and encourage developers to contribute code back that they believe will provide value to the overall community. We will interpret an explicit 'pull request' back to this repository as an implicit acknowledgement from the contributor that they wish to share the code with the community under the terms of the Apache license v2.0. 
* follow the [code style guidelines]( ) of the project as posted to the project wiki. Unfortunately, there was no unifying code guidelines defined between the BLAS & FFT projects, but code submissions should not mix styles within an individual file. We have since defined and posted a code style guideline for the projects and we expect the code to slowly transition to the new guidelines over time * separate check-ins that modify a files style from the ones that add/change/delete code. * target the **develop** branch in the repository * ensure that the [code properly builds]( https://github.com/kknox/clBLAS/wiki/Build ) * cannot break existing test cases * we encourage contributors to [run the test-short]( https://github.com/kknox/clBLAS/wiki/Testing ) suite of tests on their end before the pull-request * if possible, upload the test results associated with the pull request to a personal [gist repository]( https://gist.github.com/ ) and insert a link to the test results in the pull request so that collaborators can browse the results * if no test results are provided with the pull request, official collaborators will run the test suite on their test machines against the patch before we will accept the pull-request * if we detect failing test cases, we will request that the code associated with the pull request be fixed before the pull request will be merged * if new functionality is introduced with the pull request, sufficient test cases should be added to verify the new functionality is correct * new tests should integrate with the existing [googletest framework]( https://code.google.com/p/googletest/wiki/Primer ) located in the src/tests directory of the repo * if the collaborators feel the new tests do not provide sufficient coverage, feedback on the pull request will be left with suggestions on how to improve the tests before the pull request will be merged Pull requests will be reviewed by the set of collaborators that are assigned for the repository. Pull requests may be accepted, declined or a conversation may start on the pull request thread with feedback. If the pull request is trivial and all the submission guidelines defined above are honored, the pull request may be accepted without delay. If the pull request is good, but the guidelines defined above are not followed, the collaborators may leave feedback on the pull request and engage in a conversation with the contributor with what they can do to improve the pull request. At any time, collaborators may decline a pull request if they decide the contribution is not appropriate for the project, or the feedback from reviewers on a pull request is not being addressed in an appropriate amount of time. ## Is it possible to become an official collaborator of the repository? Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project. When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator. These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes. It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project. 
The benefit of being a repository collaborator allows you to be able to manage other peoples pull requests. clblas-2.10/LICENSE000066400000000000000000000236761264277366700137430ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS clblas-2.10/NOTICE000066400000000000000000000002431264277366700136230ustar00rootroot00000000000000AMD clBLAS Copyright 2013 Advanced Micro Devices, Inc. This product includes software developed at Advanced Micro Devices, Inc. (http://www.amd.com). 
clblas-2.10/README.md000066400000000000000000000177501264277366700142110ustar00rootroot00000000000000## Build Status | Build branch | master | develop | |-----|-----|-----| | GCC/Clang x64 | [![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.svg?branch=master)](https://travis-ci.org/clMathLibraries/clBLAS/branches) | [![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.svg?branch=develop)](https://travis-ci.org/clMathLibraries/clBLAS/branches) | | Visual Studio x64 | [![Build status](https://ci.appveyor.com/api/projects/status/v384bi6e8xv8nxjm/branch/master?svg=true)](https://ci.appveyor.com/project/kknox/clblas-5ph9i/branch/master)|[![Build status](https://ci.appveyor.com/api/projects/status/v384bi6e8xv8nxjm/branch/develop?svg=true)](https://ci.appveyor.com/project/kknox/clblas-5ph9i/branch/develop) | clBLAS ===== This repository houses the code for the OpenCL™ BLAS portion of clMath. The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see Netlib BLAS for the list of supported routines. In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming. APPML 1.10 is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms. The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves. ## clBLAS update notes 09/2015 - Introducing [AutoGemm](http://github.com/clMathLibraries/clBLAS/wiki/AutoGemm) - clBLAS's Gemm implementation has been comprehensively overhauled to use AutoGemm. AutoGemm is a suite of python scripts which generate optimized kernels and kernel selection logic, for all precisions, transposes, tile sizes and so on. - CMake is configured to use AutoGemm for clBLAS so the build and usage experience of Gemm remains unchanged (only performance and maintainability has been improved). Kernel sources are generated at build time (not runtime) and can be configured within CMake to be pre-compiled at build time. - clBLAS users with unique Gemm requirements can customize AutoGemm to their needs (such as non-default tile sizes for very small or very skinny matrices); see [AutoGemm](http://github.com/clMathLibraries/clBLAS/wiki/AutoGemm) documentation for details. ## clBLAS library user documentation [Library and API documentation][] for developers is available online as a GitHub Pages website ## Google Groups Two mailing lists have been created for the clMath projects: - [clmath@googlegroups.com][] - group whose focus is to answer questions on using the library or reporting issues - [clmath-developers@googlegroups.com][] - group whose focus is for developers interested in contributing to the library code itself ## clBLAS Wiki The [project wiki][] contains helpful documentation, including a [build primer][] ## Contributing code Please refer to and read the [Contributing][] document for guidelines on how to contribute code to this open source project. The code in the /master branch is considered to be stable, and all pull-requests should be made against the /develop branch. 
## License The source for clBLAS is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 ) ## Example The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM ```c #include #include /* Include the clBLAS header. It includes the appropriate OpenCL headers */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ #define M 4 #define N 3 #define K 5 static const cl_float alpha = 10; static const cl_float A[M*K] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45, }; static const size_t lda = K; /* i.e. lda = K */ static const cl_float B[K*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, 51, 52, 53, }; static const size_t ldb = N; /* i.e. ldb = N */ static const cl_float beta = 20; static cl_float C[M*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, }; static const size_t ldc = N; /* i.e. ldc = N */ static cl_float result[M*N]; int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clBLAS */ err = clblasSetup( ); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err ); bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err ); bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0, M * K * sizeof( *A ), A, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0, K * N * sizeof( *B ), B, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof( *C ), C, 0, NULL, NULL ); /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event ); /* Wait for calculations to be finished. */ err = clWaitForEvents( 1, &event ); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL ); /* Release OpenCL memory objects. */ clReleaseMemObject( bufC ); clReleaseMemObject( bufB ); clReleaseMemObject( bufA ); /* Finalize work with clBLAS */ clblasTeardown( ); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; } ``` ## Build dependencies ### Library for Windows * Windows® 7/8 * Visual Studio 2010 SP1, 2012 * An OpenCL SDK, such as APP SDK 2.8 * Latest CMake ### Library for Linux * GCC 4.6 and onwards * An OpenCL SDK, such as APP SDK 2.9 * Latest CMake ### Library for Mac OSX * Recommended to generate Unix makefiles with cmake ### Test infrastructure * Googletest v1.6 * ACML on windows/linux; Accelerate on Mac OSX * Latest Boost ### Performance infrastructure * Python [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/ [clmath@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath [clmath-developers@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath-developers [project wiki]: https://github.com/clMathLibraries/clBLAS/wiki [build primer]: https://github.com/clMathLibraries/clBLAS/wiki/Build [Contributing]: CONTRIBUTING.md [Apache License, Version 2.0]: http://www.apache.org/licenses/LICENSE-2.0 clblas-2.10/appveyor.yml000066400000000000000000000075521264277366700153210ustar00rootroot00000000000000# Appveyor OS list # Windows Server 2012 R2 (x64) <== Appveyor default image # Visual Studio 2015 # os: expands the build matrix to include multiple os's os: - Windows Server 2012 # compiler: expands the build matrix to include multiple compilers (per os) platform: - x64 configuration: - Release # Only clone the top level commit; don't bother with history shallow_clone: true # environment: specifies additional global variables to define per row in build matrix environment: global: CLBLAS_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\nmake\\release" OPENCL_ROOT: "%APPVEYOR_BUILD_FOLDER%\\bin\\opencl" # BOOST_ROOT: "C:/Libraries/boost" # boost 1.56, 32-bit only BOOST_ROOT: "C:\\Libraries\\boost_1_58_0" OPENCL_REGISTRY: "https://www.khronos.org/registry/cl" init: - echo init step - cmake --version - C:\"Program Files (x86)"\"Microsoft Visual Studio 12.0"\VC\vcvarsall.bat %PLATFORM% # Uncomment the following to display Remote Desktop connection details # - ps: iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) # We need to create an opencl import library that clblas can link against # Vendor based OpenCL packages are hard to use because of download size, registration requirements # and unattended installs not well supported install: - echo install step - ps: mkdir $env:OPENCL_ROOT - ps: pushd $env:OPENCL_ROOT - ps: $opencl_registry = $env:OPENCL_REGISTRY # This downloads the source to the example/demo icd library - ps: wget $opencl_registry/specs/opencl-icd-1.2.11.0.tgz -OutFile opencl-icd-1.2.11.0.tgz - ps: 7z x opencl-icd-1.2.11.0.tgz - ps: 7z x opencl-icd-1.2.11.0.tar - ps: mv .\icd\* . # This downloads all the opencl header files # The cmake build files expect a directory called inc - ps: mkdir inc/CL - ps: wget $opencl_registry/api/1.2/ | select -ExpandProperty links | where {$_.href -like "*.h*"} | select -ExpandProperty outerText | foreach{ wget $opencl_registry/api/1.2/$_ -OutFile inc/CL/$_ } # - ps: dir; if( $lastexitcode -eq 0 ){ dir include/CL } else { Write-Output boom } # Create the static import lib in a directory called lib, so findopencl() will find it - ps: mkdir lib - ps: pushd lib - cmake -G "NMake Makefiles" .. 
- nmake - ps: popd # Rename the inc directory to include, so FindOpencl() will find it - ps: ren inc include - ps: popd - ps: popd # before_build is used to run configure steps before_build: - echo before_build step # Boost 1.58 is not installed in typical fashion, help FindBoost() find binary libs with BOOST_LIBRARYDIR - ps: $env:BOOST_LIBRARYDIR = "$env:BOOST_ROOT/lib64-msvc-12.0" - ps: mkdir $env:CLBLAS_ROOT - ps: pushd $env:CLBLAS_ROOT - cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%CONFIGURATION% -DBUILD_TEST=OFF -DBUILD_CLIENT=ON -DOCL_VERSION=2.0 -DOPENCL_ROOT=%OPENCL_ROOT% %APPVEYOR_BUILD_FOLDER%/src # build_script invokes the compiler build_script: - echo build_script step - nmake package after_build: - echo after_build step - ps: ls $env:CLBLAS_ROOT - ps: mv $env:CLBLAS_ROOT\*.zip $env:APPVEYOR_BUILD_FOLDER # Appyeyor will save a copy of the package in it's personal storage artifacts: - path: '*.zip' name: binary_zip type: zip # on_finish always executes regardless of passed or failed builds on_finish: - echo on_finish step # Appveyor will push the artifacts it has saved to GitHub 'releases' tab # deploy: # provider: GitHub # auth_token: # secure: dRXIWJKpU7h2RsHX7RqmyYCtCw+Q9O3X5MArloY6p34GZC1w7bp+jQYTZqbdO7bw # artifact: binary_zip # draft: true # prerelease: true # on: # appveyor_repo_tag: true # Uncomment the following to pause the VM and wait for RDP connetion to debug # - ps: $blockRdp = $true; iex ((new-object net.webclient).DownloadString('https://raw.githubusercontent.com/appveyor/ci/master/scripts/enable-rdp.ps1')) clblas-2.10/doc/000077500000000000000000000000001264277366700134655ustar00rootroot00000000000000clblas-2.10/doc/README-BinaryCacheOnDisk.txt000066400000000000000000000041511264277366700204420ustar00rootroot00000000000000S. Chauveau CAPS Entreprise clBLAS Project ------------------------------ April 30,2014 The implementation of a binary cache for CL programs can be found in files src/include/binary_lookup.h and src/library/blas/generic/binary_lookup.cc The cache is currently disabled by default. It can be enabled by setting the environment variable 'CLBLAS_CACHE_PATH' to the directory containing the cache entries. In the code itself, accesses to the cache are controlled by the BinaryLookup class. A typical cache query looks as follow: (1) Create a local instance of BinaryLookup (2) Specify the additional characteristics (i.e. variants) of the requested program. That information combined with the program name and the OpenCL context and device shall form a unique signature for the binary program. (3) Perform the effective search by calling the 'found' method (4a) If the search was successful then cl_program can be retrieved by a call to the 'getProgram' method (4b) If the search was not successful then a cl_program must be created and populated in the cache by a call to the 'setProgram' method. (5) Destroy the BinaryLookup local instance. So in practice a typical query shall looks as follow: cl_program program ; // The program name is part of the signature and shall be unique const char * program_name = "... my unique program name ... " ; BinaryLookup bl(context, device, program_name); // Specify some additional information used to build a // unique signature for that cache entry bl.variantInt( vectorSize ); bl.variantInt( hasBorder ); ... // Perform the query if ( bl.found() ) { // Success! use the cl_program retrieved from the cache program = bl.getProgram(); } else { // Failure! 
we need to build the program program = build_my_program(context,device,vectorSize,...) ; // and inform the lookup object of the program bl.setProgram(program); // and finally populate the cache bl.populateCache() } // The BinaryLookup shall now be destroyed clblas-2.10/doc/README-FunctorConcepts.txt000066400000000000000000000102661264277366700203050ustar00rootroot00000000000000S. Chauveau CAPS Entreprise April 30, 2014 The Functor concept was introduced in clBLAS to simplify the creation of specialized versions for dedicated architectures. The original system, referred as the 'Solver' system in this document, is very centralized and not flexible enough to insert customized kernels. The Functor =========== A functor is simply a C++ object that provides an implementation of a function. In the current case, that function is one of the BLAS calls implemented in OpenCL. The base class of all functors is clblasFunctor - see src/library/blas/functor/include/functor.h - see src/library/blas/functor/functor.cc That class does not provide much by itself but it is supposed to be derived once for each BLAS function to be implemented. For instance the clblasSgemmFunctor class will be the base class of all functors providing a generic or specific implementation of SGEMM. A generic functor is one that is applicable to all possible arguments of the function it implements. In most cases, there will be at least one generic functor that will simply call the existing Solver-based implementation of the function. For SGEMM, that is the class clblasSgemmFunctorFallback. A specific functor is one that is applicable to only a subset of the possible arguments of the function it implements. For instance, a SGEMM functor could only implement it for matrices of a given block size or only for square matrices or only for a specific device architecture (e.g. AMD Hawai) etc The Functor Selector ==================== Multiple generic and specific functors may be available to implement each clBLAS call. The selection of the proper functor is delegated to the class clblasFunctorSelector whose default implementation typically returns the fallback functors. - see src/library/blas/functor/include/functor_selector.h - see src/library/blas/functor/functor_selector.cc So clblasFunctorSelector provides a large set of virtual selection methods. Typically, a method to select a specific functor will be provided for each supported BLAS function. Another method may be provided to select a generic functor but that is not mandatory. The default implementation of clblasFunctorSelector is typically that the specific selector is redirected to the generic one returning the fallback functor (so using the existing Solver-based implementation). The class clblasFunctorSelector is supposed to be derived once for each supported architecture (e.g. Hawai, Tahiti, ...) and a single global instance of each of those derived classes shall be created. This is important because those instances register themselves in a global data structure that is later used to find the proper clblasFunctorSelector according to the architecture (see clblasFunctorSelector::find() ) Functor Management & Cache ========================== Each functor contains a reference counter that, when it reaches zero, causes the functor destruction. See the members clblasFunctor::retain() and clblasFunctor::release(). Of course, to be efficient, functors must be reusable between BLAS calls so some mechanisms must be implemented to manage the functors. 
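For example, a caller that obtains a functor (typically from a selector) is expected to balance that acquisition with a call to release() once the BLAS call has been issued. A minimal sketch of the intended protocol is shown below; the SSCAL-specific selection method named here is only illustrative of the per-function selection entry points described elsewhere in this documentation:

   // Sketch of the reference counting protocol (illustrative names)
   clblasFunctorSelector * sel     = clblasFunctorSelector::find(queue);
   clblasSscalFunctor    * functor = sel->select_sscal_specific(args);
   clblasStatus            status  = functor->execute(args);
   functor->release();  // may destroy the functor unless it is global or cached

Global functors such as the fallbacks keep their retain() and release() members empty so that their single instance is never destroyed.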
Some functors, such as the fallback functors, are independent of the arguments and of the opencl context & device. Those can typically be implemented using a single global instance that will never be destroyed. Other functors, such as those that manage a cl_program internally, are dependent of the opencl context & device and sometimes of some arguments. They need to be stored in caches using some information as keys. In the current implementation, we propose that each functor class shall implement its own private cache. Such functors shall not be created directly using its constructor but via a dedicated 'provide' function (the name 'provide' is not mandatory) that will take care of managing the internal cache. The template class clblasFunctorCache is provided as a simple implementation of a cache of functors of type F. Use of that cache is not a mandatory part of the functor design. Another strategies could be to keep a single instance of the functor and implement a cache for the cl_program or to implement a global cache shared by multiple functor classes. clblas-2.10/doc/README-HowToIntroduceFunctors.txt000066400000000000000000000307711264277366700216320ustar00rootroot00000000000000S. Chauveau CAPS Entreprise clBLAS Project ------------------------------ April 30,2014 This document describes the steps needed to introduce the Functor framework for a clBLAS function currently implemented using the previous Solver mechanism. The procedure is composed of the following steps: (1) Declaration of a new base functor classes for the considered clBLAS function. (2) Create a new fallback class derived from the class created in (1) and using the existing Solver implementation. (3) Add the appropriate members to the clblasFunctorSolver class (4) Modify the clBLAS function to use the functor. In the following, we will consider the case of the XSCAL functions. Initial State ============= The XSCAL functions are originally implemented in the file src/library/blas/xscal.c Most of the Solver-based implementation occurs within the static function doScal() that is shared by all SCAL functions. clblasSscal(), clblasDscal() ... are basically a single call to doScal() clblasStatus doScal(...) { ... // Do all the magic } clblasStatus clblasSscal( size_t N, float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events ) { CLBlasKargs kargs; #ifdef DEBUG_SCAL printf("\nSSCAL Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDscal(...) ... clblasStatus clblasCscal(...) ... clblasStatus clblasZscal(...) ... ... Step 1: Declaration of new base functor classes ================================================ All the SCAL variants have identical arguments so it is reasonable to use a templates to avoid rewriting similar classes again and again. Using macros would also work. That is just a matter of personal taste. For convenience, the base template class will provide an internal structure type called Args that will be used to store the argument. Using an Args type is not strictly needed but it simplifies a lot the creation of the functor classes and of their future derived classes. So create a new file src/library/blas/functor/include/functor_xscal.h containing the base functor class. 
In that specific case we also have to consider the case of clblasZdscal() and clblasCsscal(), which explains why the template requires two types TX and Talpha. TX is the type of the vector elements while Talpha is the type of the alpha argument. template class clblasXscalFunctor : public clblasFunctor { public: // Structure used to store all XSCAL arguments struct Args { size_t N; Talpha alpha; cl_mem X; size_t offx; int incx; cl_command_queue queue; cl_uint numEventsInWaitList; const cl_event * eventWaitList; cl_event * events; Args(size_t N, Talpha alpha, cl_mem X, size_t offx, int incx, cl_command_queue queue, cl_uint numEventsInWaitList, const cl_event * eventWaitList, cl_event * events) : N(N), alpha(alpha), X(X), offx(offx), incx(incx), queue(queue), numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), events(events) { } }; virtual clblasStatus execute(Args & args) = 0; }; Using this template class it is now possible to define the base functor class corresponding to each SCAL function: class clblasSscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a DSCAL implementation // class clblasDscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a CSCAL implementation // class clblasCscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a ZSCAL implementation // class clblasZscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a CSSCAL implementation // class clblasCsscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a ZDSCAL implementation // class clblasZdscalFunctor: public clblasXscalFunctor { }; A shorter alternative could be to use 'typedef' instead but using class offers the opportunity to extend the functor with specific features (i.e. it is possible to add new members to a class but not to a typedef). STEP 2: Create the new fallback classes ======================================= In the following, we only consider the case of clblasSscal. For each of the functor classes declared during STEP 1, we should now declare the fallback functor class that will provide the Solver-based implementation of the function. We add the following src/library/blas/functor/include/functor_xscal.h // // Fallback functor for SSCAL : implement the sscal using the old solver mechanism // class clblasSscalFunctorFallback : public clblasSscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasSscalFunctor virtual clblasStatus execute(Args & a); public: static clblasSscalFunctorFallback * provide (); }; The file src/library/blas/xscal.c is then renamed into src/library/blas/functor/functor_xscal.cc and modified as follow: First, the clblasSscal() function is transformed into clblasSscalFunctorFallback::execute() clblasStatus clblasSscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = args.alpha; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } Second, a single instance of clblasSscalFunctorFallback is created as a static variable that will be returned by the clblasSscalFunctorFallback::provide() member. 
static clblasSscalFunctorFallback dscal_fallback; clblasSscalFunctorFallback * clblasSscalFunctorFallback::provide () { static clblasSscalFunctorFallback dscal_fallback; return & dscal_fallback; } Third, the retain() and release() members must be reimplemented to prevent the destruction of the unique clblasSscalFunctorFallback instance. void clblasSscalFunctorFallback::retain() { // clblasSscalFunctorFallback has a single global instance // and shall never be freed } void clblasSscalFunctorFallback::release() { // clblasSscalFunctorFallback has a single global instance // and shall never be freed } STEP 3: Add the appropriate members to the clblasFunctorSolver class ======================================================================= The clblasFunctorSolver shall typically be extended with two new virtual methods: one to select a specific functor and one to select a generic functor. Edit the file src/library/blas/functor/include/functor_selector.h and add the following members declarations to the class clblasFunctorSelector: // Provide a XSCAL Functor usable in all cases virtual clblasSscalFunctor * select_sscal_generic(); virtual clblasDscalFunctor * select_dscal_generic(); virtual clblasCscalFunctor * select_cscal_generic(); virtual clblasZscalFunctor * select_zscal_generic(); virtual clblasCsscalFunctor * select_csscal_generic(); virtual clblasZdscalFunctor * select_zdscal_generic(); // Provide XSCAL functors optimized for specific arguments virtual clblasSscalFunctor * select_sscal_specific(clblasSscalFunctor::Args & args); virtual clblasDscalFunctor * select_dscal_specific(clblasDscalFunctor::Args & args); virtual clblasCscalFunctor * select_cscal_specific(clblasCscalFunctor::Args & args); virtual clblasZscalFunctor * select_zscal_specific(clblasZscalFunctor::Args & args); virtual clblasCsscalFunctor * select_csscal_specific(clblasCsscalFunctor::Args & args); virtual clblasZdscalFunctor * select_zdscal_specific(clblasZdscalFunctor::Args & args); The naming scheme used here is not mandatory but is recommended to keep the whole infrastructure consistent. Then, add their default implementation in src/library/blas/functor/functor_selector.cc. clblasSscalFunctor * clblasFunctorSelector::select_sscal_generic() { return clblasSscalFunctorFallback::provide(); } clblasSscalFunctor * clblasFunctorSelector::select_sscal_specific(clblasSscalFunctor::Args &) { return this->select_sscal_generic() ; } ... STEP 4: Modify the clBLAS function to use the functor ===================================================== Create a file src/library/blas/xscal.cc to reimplement the clBLAS API functions. 
STEP 4: Modify the clBLAS function to use the functor
=====================================================

Create a file src/library/blas/xscal.cc to reimplement the clBLAS API
functions.

First, copy the original function skeletons from the now obsolete file
src/library/blas/xscal.c. Then fill in each skeleton to perform the following
actions:
  (A) Perform some consistency checks on the arguments
  (B) Create and initialize a local Args object
  (C) Obtain the clblasFunctorSelector corresponding to the current device (via the queue)
  (D) Ask that selector for a specific functor
  (E) Execute the functor
  (F) Release the functor

The code shall typically look like this:

extern "C" clblasStatus
clblasSscal(size_t N,
            float alpha,
            cl_mem X,
            size_t offx,
            int incx,
            cl_uint numCommandQueues,
            cl_command_queue *commandQueues,
            cl_uint numEventsInWaitList,
            const cl_event *eventWaitList,
            cl_event *events)
{
  CHECK_VECTOR_X( X, N, offx, incx ) ;
  CHECK_QUEUES( numCommandQueues, commandQueues ) ;
  CHECK_WAITLIST( numEventsInWaitList, eventWaitList ) ;

  if ( numCommandQueues > 1 )
  {
    numCommandQueues = 1 ;  // No support for multi-device (yet)
  }

  cl_command_queue queue = commandQueues[0];

  clblasSscalFunctor::Args args(N,
                                alpha,
                                X,
                                offx,
                                incx,
                                queue,
                                numEventsInWaitList,
                                eventWaitList,
                                events);

  clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue);

  clblasSscalFunctor * functor = fselector->select_sscal_specific(args);

  clblasStatus res = functor->execute(args);

  functor->release();

  return res;
}

Reminder: this is a C++ file, so the API functions shall be declared extern "C".

Remark: what is missing in that example is a proper verification of the
arguments (e.g. numCommandQueues shall be strictly positive, commandQueues[0]
shall be non-NULL, ...).

Conclusion
==========

After following all the steps above, the clBLAS APIs shall now use the Solver
based implementation via their respective fallback functors. Other specialized
functors can then be implemented and integrated in the appropriate methods of
the functor selector.

clblas-2.10/doc/README-TransformASolverIntoAFunctor.txt000066400000000000000000000336451264277366700227350ustar00rootroot00000000000000S. Chauveau
CAPS Entreprise

clBLAS Project
------------------------------
April 30, 2014

This document describes a possible procedure to transform an existing
solver-based implementation for a given BLAS function into a functor-based
implementation. We assume here that the basic functor infrastructure is
already implemented for that function.

More precisely, we will consider the case of the family of the XSCAL functions
for which this exercise was already performed. The resulting code can
currently be found in the files
  - src/library/blas/functor/include/functor_xscal_generic.h
  - src/library/blas/functor/functor_xscal_generic.cc

XSCAL consists of 6 functions, all performing the scaling (i.e.
multiplication) of a vector X by a scalar alpha. SSCAL, DSCAL, CSCAL and ZSCAL
are respectively for the float, double, complex float and complex double cases
while CSSCAL and ZDSCAL are special cases where the vector X is complex and
the scalar alpha is not.

The file 'functor_xscal.h' defines a generic functor type for each of those
functions:
  - clblasSscalFunctor
  - clblasDscalFunctor
  - clblasCscalFunctor
  - clblasZscalFunctor
  - clblasCsscalFunctor
  - clblasZdscalFunctor

Each of those base functor types defines a similar internal type Args that is
used to store the corresponding SCAL arguments (Talpha in the code sample
below stands for the type of the alpha argument):

struct Args
{
  size_t           N;
  Talpha           alpha;
  cl_mem           X;
  size_t           offx;
  int              incx;
  cl_command_queue queue;
  cl_uint          numEventsInWaitList;
  const cl_event * eventWaitList;
  cl_event       * events;
  ...
}

The OpenCL code used in the generic functor can be found in the file
  - src/library/blas/gens/clTemplates/scal.cl

This file is not really an OpenCL program but a template that needs to be
processed using the existing 'kprintf' API. We assume that the reader is
already familiar with that API.

Apart from the data type (float, double, ...), that template can also be
parametrized using two coefficients:
  - a vector size
  - whether X is properly aligned for the chosen vector size.

Those coefficients (combined with the OpenCL context and device) will form
what could be called the signature of the functors.

Since all generic functors will use the same kind of signature, the file
'functor_xscal_generic.h' starts by defining a reusable POD (Plain Old Data)
type for it:

struct _clblasXscalFunctorGenericData
{
  int  vecLen ;   // Vectorization size
  bool doVLOAD ;  // if aligned vector load/store can be used

  //
  // The operator < is needed for the cache
  //
  bool operator<(const _clblasXscalFunctorGenericData &b) const
  {
    const _clblasXscalFunctorGenericData &a = *this ;
    if ( a.vecLen  != b.vecLen  ) return a.vecLen  < b.vecLen ;
    if ( a.doVLOAD != b.doVLOAD ) return a.doVLOAD < b.doVLOAD ;
    return false ;
  }
} ;

This type will later be used as a key in the functor caches, so it is given
the 'operator<' implementation needed to use it as a key in the class
clblasFunctorCache (a short illustration is given at the end of this section).

Next, the file 'functor_xscal_generic.h' provides the declaration of the
functors. For SSCAL, that is the class clblasSscalFunctorGeneric, defined as
follows:

class clblasSscalFunctorGeneric : public clblasSscalFunctor
{
  public:
    typedef _clblasXscalFunctorGenericData Data ;
    Data data;

  public:  // Constructor & Destructor
    clblasSscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err);
    ~clblasSscalFunctorGeneric();

  public:  // Inherited members from clblasSscalFunctor
    virtual clblasStatus execute(Args & a);

  public:
    static clblasSscalFunctorGeneric * provide (Args & a);

  public:
    typedef clblasFunctorCache<clblasSscalFunctorGeneric, Data> Cache;
    static Cache cache;

  public:
    cl_program program;
};

You should recognize here the execute() method that has to be implemented by
all implementations of clblasSscalFunctor, and the provide() method that will
be used in place of the constructor to ensure that the functor is properly
cached. A static cache is also provided, using _clblasXscalFunctorGenericData
(or its local alias Data) as custom key. Ideally the constructor shall be
private to prevent using it directly, but for technical reasons (i.e. the use
of templates to factorize the implementation of 'provide') it had to be made
public. Each functor also carries a Data member and a cl_program member that
will be used by the execute() function.

The other functors are implemented in a very similar way. In fact, the 6
functor classes defined in this file are almost identical except for minor
details. Their implementation in 'functor_xscal_generic.cc' will make
extensive use of templates to avoid rewriting too much code. Another approach
that requires some minor but not negligible architectural changes is possible
and described in Appendix A below.
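As announced above, here is a short illustration of why Data must provide
operator<. The internals of clblasFunctorCache are not shown in this document;
the snippet below only assumes that it behaves like an ordered associative
container (a std::map is used as a stand-in), which is an assumption and not
actual clBLAS code:

#include <map>
#include "functor_xscal_generic.h"   // for _clblasXscalFunctorGenericData

typedef _clblasXscalFunctorGenericData Data;

static void illustrateCacheKey()
{
  std::map<Data, int> cache;        // stand-in for the functor cache

  Data scalar     = { 1, false };   // vecLen=1, no aligned vector loads
  Data vectorized = { 4, true  };   // vecLen=4, aligned vector loads

  cache[scalar]     = 0;            // distinct (vecLen, doVLOAD) pairs land in
  cache[vectorized] = 1;            // distinct entries because operator< defines
                                    // a strict weak ordering over Data
}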
The 'execute' method is implemented in functor_xscal_generic.cc using the
static templated function 'xscalExecute':

clblasStatus clblasSscalFunctorGeneric::execute(Args & args)
{
  size_t nThreads = args.N; // to customize according to the device, data and args
  return xscalExecute<cl_float>(args.queue,
                                this->program,
                                "Sscal_kernel",
                                args.alpha,
                                args.X,
                                args.N,
                                args.offx,
                                args.incx,
                                nThreads);
}

The last argument of xscalExecute represents the number of threads. In that
version it is simply set to args.N, which is functionally correct but clearly
not optimal. A more complex formula depending on the architecture and on the
Data is clearly needed here. The template type cl_float is the type of alpha,
which is strictly speaking not mandatory in this case (because it can be
inferred from the argument). Apart from the arguments, the command queue and
the OpenCL program, the template mechanism also changes the kernel name, which
is passed here as the third argument.

The implementation of xscalExecute() is a typical OpenCL kernel call:

template <typename TA>
static clblasStatus xscalExecute(cl_command_queue queue,
                                 cl_program program,
                                 const char * kernelName,
                                 TA alpha,
                                 cl_mem X,
                                 uint N,
                                 uint offx,
                                 int incx,
                                 size_t nThreads)
{
  cl_int err;
  cl_kernel kernel = clCreateKernel( program, kernelName, &err);
  if (err != CL_SUCCESS) return clblasStatus(err) ;

  clblasFunctor::setKernelArg (kernel, 0, alpha);
  clblasFunctor::setKernelArg (kernel, 1, X);
  clblasFunctor::setKernelArg (kernel, 2, N);
  clblasFunctor::setKernelArg (kernel, 3, offx);
  clblasFunctor::setKernelArg (kernel, 4, incx);

  size_t globalThreads[1] = { nThreads };

  cl_event event;
  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, NULL, 0, NULL, &event);

  clReleaseKernel(kernel) ;
  return clblasStatus(err) ;
}

The functor constructor is implemented in a similar way using the templated
static function xcalCreateProgram:

template <class FUNCTOR>
static cl_program xcalCreateProgram(cl_context ctxt,
                                    cl_device_id dev,
                                    char type,
                                    const char* functorName,
                                    const typename FUNCTOR::Data & data,
                                    cl_int & err)
{
  BinaryLookup bl(ctxt, dev, functorName);

  bl.variantInt(data.vecLen);
  bl.variantInt(data.doVLOAD);

  if ( bl.found() ) // may create empty file or may wait until file is ready
  {
    return bl.getProgram();
  }
  else
  {
    char tempTemplate[32*1024];
    char buf         [32*1024];
    cl_program scalProgram;

    strcpy( tempTemplate, (char*)scal_kernel );
    kprintf kobj( type, data.vecLen, data.doVLOAD, data.doVLOAD);
    kobj.spit((char*)buf, tempTemplate);

    scalProgram = BinaryLookup::buildProgramFromSource(buf, ctxt, dev, err /*, options*/);

    if(scalProgram)
    {
      bl.setProgram(scalProgram);
      bl.populateCache();
    }

    return scalProgram;
  }
}

clblasSscalFunctorGeneric::clblasSscalFunctorGeneric(cl_context ctxt,
                                                     cl_device_id dev,
                                                     const Data & data,
                                                     cl_int & err)
  : program(0)
{
  this->program = xcalCreateProgram<clblasSscalFunctorGeneric>( ctxt, dev, 'S', "clblasSscalFunctorGeneric", data, err );
}

We recognize here a typical use of the BinaryLookup class used to manage the
binary cache on disk when it is enabled. The query effectively occurs during
the call to 'bl.found()'. Before that, the members of the Data structure,
which is also used as a key in this cache, are added to the lookup object. It
is very important not to forget any member, else binary cache entries could be
reused for incompatible functors. The functorName argument is also used to
index the cache entries. It shall be unique, so the functor class name is used
here.
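To make the previous warning concrete: if the Data structure were ever
extended with an additional coefficient, that coefficient would have to be
registered with the lookup as well. The fragment below is purely hypothetical
(there is no 'unroll' coefficient in the current code); it only illustrates
the rule:

// Hypothetical: suppose Data gained an extra 'int unroll' coefficient.
BinaryLookup bl(ctxt, dev, functorName);
bl.variantInt(data.vecLen);
bl.variantInt(data.doVLOAD);
bl.variantInt(data.unroll);   // forgetting this line would let two functors that
                              // differ only by 'unroll' share the same cached binary
// The operator< of Data would also have to compare 'unroll', so that the
// in-memory functor cache distinguishes the new variant as well.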
After the call to bl.found(), two cases are to be considered.

First, if a compatible entry was found in the binary cache, then a proper
cl_program can be obtained with bl.getProgram(). Otherwise, the program shall
be built manually using the kprintf and BinaryLookup::buildProgramFromSource
utility functions. The resulting program shall then be stored in the binary
cache for further reuse.

Last but not least, the 'provide' member is also implemented using a templated
call:

template <class FUNCTOR>
static FUNCTOR * xscalProvide(typename FUNCTOR::Args & args)
{
  cl_device_id dev;
  cl_context   ctxt;

  cl_int err = clblasFunctor::getDeviceAndContext(args.queue, dev, ctxt);
  if (err != CL_SUCCESS)
  {
    return NULL;
  }

  uint vecLen  = 1 ;     // To customize according to the device and args
  uint doVLOAD = false ; // TO DO (see scal_reg.cpp)

  typename FUNCTOR::Data data = { vecLen , doVLOAD };

  typename FUNCTOR::Cache::Lookup lookup(FUNCTOR::cache, ctxt, dev, data ) ;

  if ( lookup.ok() )
  {
    FUNCTOR * functor = lookup.get();
    functor->retain();
    return functor;
  }

  FUNCTOR * functor = new FUNCTOR(ctxt, dev, data, err);
  if (err != CL_SUCCESS)
  {
    return NULL;
  }

  lookup.set(functor) ;

  return functor;
}

clblasSscalFunctorGeneric *
clblasSscalFunctorGeneric::provide(clblasSscalFunctor::Args & args)
{
  return xscalProvide<clblasSscalFunctorGeneric>(args);
}

This implementation of xscalProvide is pretty basic. After extracting the
device and context from the queue (which would probably be better done
earlier), the coefficients of the Data structure are chosen. In this early
implementation, a vector size of 1 is used, which is safe but not optimal.
What is still missing is a set of utility functions to help analyze the
properties of the vector argument X in order to figure out the best choice for
the vectorization. This code probably exists somewhere in the current Solver
implementation but still needs to be provided for the functor.

Once the Data structure is populated, a lookup in the private cache of this
functor can be performed. As with the binary cache, we have to differentiate
the cases where a cache entry already exists or not. In the latter case, the
functor must be manually created.

Appendix A - Alternative approach to improve code reuse
=======================================================

The implementation described above suffers from a significant problem due to a
variant of the so-called diamond problem of C++ (and of most object-oriented
programming languages). Simply speaking, the technical choice made to create
one base class for each functor of the SCAL family implies that it becomes
very difficult to share code between the classes derived from those base
classes.

                           clblasFunctor
                           /     |     \
                          /      |      \
        clblasSscalFunctor   clblasDscalFunctor   ...
               |                    |
               |                    |
   clblasSscalFunctorGeneric   clblasDscalFunctorGeneric   ...

The problem is partially solved by introducing some templated functions, but
this is not a very elegant solution.

An alternative could be to define only one base class for all functors of the
SCAL family and then to define only one 'generic' functor class (and one
fallback class). The immediate advantage is that the number of functor classes
to be written would be greatly reduced (typically by a factor of 4 for most
BLAS functions). The disadvantage is that the implementation of those classes
would be slightly more complex since they would have to manage all their
variants at once.

It is not too late to switch to that new design as long as not too many
functors have been written. This is a minor change to the overall functor
design.
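To make the alternative a bit more concrete, such a single base class could
expose one Args type covering all SCAL variants, with the element type carried
as a runtime tag. This is only a sketch of the idea under that design, not
existing clBLAS code; the class layout and member names below are hypothetical:

class clblasScalFunctor : public clblasFunctor
{
  public:
    struct Args
    {
      char   type;             // 'S', 'D', 'C', 'Z', ... identifying the variant
      size_t N;
      union                    // alpha for whichever element type is selected
      {
        cl_float      s;
        cl_double     d;
        FloatComplex  c;
        DoubleComplex z;
      } alpha;
      cl_mem           X;
      size_t           offx;
      int              incx;
      cl_command_queue queue;
      cl_uint          numEventsInWaitList;
      const cl_event * eventWaitList;
      cl_event       * events;
    };

    virtual clblasStatus execute(Args & args) = 0;
};

// A single clblasScalFunctorGeneric (and a single fallback) would then handle
// all variants at once, e.g. by dispatching on args.type.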
clblas-2.10/doc/clBLAS.doxy000066400000000000000000002411231264277366700154350ustar00rootroot00000000000000# Doxyfile 1.8.4 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project # # All text after a hash (#) is considered a comment and will be ignored # The format is: # TAG = value [value, ...] # For lists items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (" ") #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all # text before the first occurrence of this tag. Doxygen uses libiconv (or the # iconv built into libc) for the transcoding. See # http://www.gnu.org/software/libiconv for the list of possible encodings. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or sequence of words) that should # identify the project. Note that if you do not use Doxywizard you need # to put quotes around the project name if it contains spaces. PROJECT_NAME = clBLAS # The PROJECT_NUMBER tag can be used to enter a project or revision number. # This could be handy for archiving the generated documentation or # if some version control system is used. PROJECT_NUMBER = 2.0 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer # a quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify an logo or icon that is # included in the documentation. The maximum height of the logo should not # exceed 55 pixels and the maximum width should not exceed 200 pixels. # Doxygen will copy the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. # If a relative path is entered, it will be relative to the location # where doxygen was started. If left blank the current directory will be used. OUTPUT_DIRECTORY = ..\..\bin\clBLAS.doxy # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create # 4096 sub-directories (in 2 levels) under the output directory of each output # format and will distribute the generated files over these directories. # Enabling this option can be useful when feeding doxygen a huge amount of # source files, where putting all generated files in the same directory would # otherwise cause performance problems for the file system. CREATE_SUBDIRS = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # The default language is English, other supported languages are: # Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, # Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, # Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English # messages), Korean, Korean-en, Latvian, Lithuanian, Norwegian, Macedonian, # Persian, Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, # Slovak, Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. 
OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will # include brief member descriptions after the members that are listed in # the file and class documentation (similar to JavaDoc). # Set to NO to disable this. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend # the brief description of a member or function before the detailed description. # Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator # that is used to form the text in various listings. Each string # in this list, if found as the leading text of the brief description, will be # stripped from the text and the result after processing the whole list, is # used as the annotated text. Otherwise, the brief description is used as-is. # If left blank, the following values are used ("$name" is automatically # replaced with the name of the entity): "The $name class" "The $name widget" # "The $name file" "is" "provides" "specifies" "contains" # "represents" "a" "an" "the" ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # Doxygen will generate a detailed section even if there is only a brief # description. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full # path before files name in the file list and in the header files. If set # to NO the shortest path that makes the file name unique will be used. FULL_PATH_NAMES = YES # If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag # can be used to strip a user-defined part of the path. Stripping is # only done if one of the specified strings matches the left-hand part of # the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the # path to strip. Note that you specify absolute paths here, but also # relative paths, which will be relative from the directory where doxygen is # started. STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of # the path mentioned in the documentation of a class, which tells # the reader which header file to include in order to use a class. # If left blank only the name of the header file containing the class # definition is used. Otherwise one should specify the include paths that # are normally passed to the compiler using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter # (but less readable) file names. This can be useful if your file system # doesn't support long names like on DOS, Mac, or CD-ROM. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen # will interpret the first line (until the first dot) of a JavaDoc-style # comment as the brief description. If set to NO, the JavaDoc # comments will behave just like regular Qt-style comments # (thus requiring an explicit @brief command for a brief description.) 
JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then Doxygen will # interpret the first line (until the first dot) of a Qt-style # comment as the brief description. If set to NO, the comments # will behave just like regular Qt-style comments (thus requiring # an explicit \brief command for a brief description.) QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen # treat a multi-line C++ special comment block (i.e. a block of //! or /// # comments) as a brief description. This used to be the default behaviour. # The new default is to treat a multi-line C++ comment block as a detailed # description. Set this tag to YES if you prefer the old behaviour instead. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES (the default) then an undocumented # member inherits the documentation from any documented member that it # re-implements. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce # a new page for each member. If set to NO, the documentation of a member will # be part of the file/class/namespace that contains it. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. # Doxygen uses this value to replace tabs by spaces in code fragments. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that acts # as commands in the documentation. An alias has the form "name=value". # For example adding "sideeffect=\par Side Effects:\n" will allow you to # put the command \sideeffect (or @sideeffect) in the documentation, which # will result in a user-defined paragraph with heading "Side Effects:". # You can put \n's in the value part of an alias to insert newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding # "class=itcl::class" will allow you to use the command class in the # itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C # sources only. Doxygen will then generate output that is more tailored for C. # For instance, some of the names that are used will be different. The list # of all members will be omitted, etc. OPTIMIZE_OUTPUT_FOR_C = YES # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java # sources only. Doxygen will then generate output that is more tailored for # Java. For instance, namespaces will be presented as packages, qualified # scopes will look different, etc. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources only. Doxygen will then generate output that is more tailored for # Fortran. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for # VHDL. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, # and language is one of the parsers supported by doxygen: IDL, Java, # Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, # C++. For instance to make doxygen treat .inc files as Fortran files (default # is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. 
Note # that for custom extensions you also need to set FILE_PATTERNS otherwise the # files are not read by doxygen. EXTENSION_MAPPING = # If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all # comments according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you # can mix doxygen, HTML, and XML commands with Markdown formatting. # Disable only in case of backward compatibilities issues. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by by putting a % sign in front of the word # or globally by setting AUTOLINK_SUPPORT to NO. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should # set this tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); v.s. # func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. # Doxygen will parse them like normal C++ but will assume all classes use public # instead of private inheritance when no explicit protection keyword is present. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES (the # default) will make doxygen replace the get and set methods by a property in # the documentation. This will only work if the methods are indeed getting or # setting a simple type. If this is not the case, or you want to show the # methods anyway, you should set this option to NO. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES, then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. DISTRIBUTE_GROUP_DOC = NO # Set the SUBGROUPING tag to YES (the default) to allow class member groups of # the same type (for instance a group of public functions) to be put as a # subgroup of that type (e.g. under the Public Functions section). Set it to # NO to prevent subgrouping. Alternatively, this can be done per class using # the \nosubgrouping command. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and # unions are shown inside the group in which they are included (e.g. using # @ingroup) instead of on a separate page (for HTML and Man pages) or # section (for LaTeX and RTF). INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and # unions with only public data fields or simple typedef fields will be shown # inline in the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. 
If set # to NO (the default), structs, classes, and unions are shown on a separate # page (for HTML and Man pages) or section (for LaTeX and RTF). INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum # is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically # be useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can # be an expensive process and often the same symbol appear multiple times in # the code, doxygen keeps a cache of pre-resolved symbols. If the cache is too # small doxygen will become slower. If the cache is too large, memory is wasted. # The cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid # range is 0..9, the default is 0, corresponding to a cache size of 2^16 = 65536 # symbols. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in # documentation are documented, even if no documentation was available. # Private class members and static file members will be hidden unless # the EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES all private members of a class # will be included in the documentation. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES all members with package or internal # scope will be included in the documentation. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES all static members of a file # will be included in the documentation. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) # defined locally in source files will be included in the documentation. # If set to NO only classes defined in header files are included. EXTRACT_LOCAL_CLASSES = NO # This flag is only useful for Objective-C code. When set to YES local # methods, which are defined in the implementation section but not in # the interface are included in the documentation. # If set to NO (the default) only methods in the interface are included. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base # name of the file that contains the anonymous namespace. By default # anonymous namespaces are hidden. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all # undocumented members of documented classes, files or namespaces. # If set to NO (the default) these members will be included in the # various overviews, but no documentation section is generated. # This option has no effect if EXTRACT_ALL is enabled. 
HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. # If set to NO (the default) these classes will be included in the various # overviews. This option has no effect if EXTRACT_ALL is enabled. HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all # friend (class|struct|union) declarations. # If set to NO (the default) these declarations will be included in the # documentation. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any # documentation blocks found inside the body of a function. # If set to NO (the default) these blocks will be appended to the # function's detailed documentation block. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation # that is typed after a \internal command is included. If the tag is set # to NO (the default) then the documentation will be excluded. # Set it to YES to include the internal documentation. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate # file names in lower-case letters. If set to YES upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen # will show members with their full class and namespace scopes in the # documentation. If set to YES the scope will be hidden. HIDE_SCOPE_NAMES = NO # If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen # will put a list of the files that are included by a file in the documentation # of that file. SHOW_INCLUDE_FILES = YES # If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen # will list include files with double quotes in the documentation # rather than with sharp brackets. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES (the default) then a tag [inline] # is inserted in the documentation for inline members. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen # will sort the (detailed) documentation of file and class members # alphabetically by member name. If set to NO the members will appear in # declaration order. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the # brief documentation of file, namespace and class members alphabetically # by member name. If set to NO (the default) the members will appear in # declaration order. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen # will sort the (brief and detailed) documentation of class members so that # constructors and destructors are listed first. If set to NO (the default) # the constructors will appear in the respective orders defined by # SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. # This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO # and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the # hierarchy of group names into alphabetical order. If set to NO (the default) # the group names will appear in their defined order. 
SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be # sorted by fully-qualified names, including namespaces. If set to # NO (the default), the class list will be sorted only by class name, # not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the # alphabetical list. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to # do proper type resolution of all parameters of a function it will reject a # match between the prototype and the implementation of a member function even # if there is only one candidate or it is obvious which candidate to choose # by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen # will still accept a match between prototype and implementation in such cases. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or # disable (NO) the todo list. This list is created by putting \todo # commands in the documentation. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or # disable (NO) the test list. This list is created by putting \test # commands in the documentation. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or # disable (NO) the bug list. This list is created by putting \bug # commands in the documentation. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or # disable (NO) the deprecated list. This list is created by putting # \deprecated commands in the documentation. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional # documentation sections, marked by \if section-label ... \endif # and \cond section-label ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines # the initial value of a variable or macro consists of for it to appear in # the documentation. If the initializer consists of more lines than specified # here it will be hidden. Use a value of 0 to hide initializers completely. # The appearance of the initializer of individual variables and macros in the # documentation can be controlled using \showinitializer or \hideinitializer # command in the documentation regardless of this setting. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated # at the bottom of the documentation of classes and structs. If set to YES the # list will mention the files that were used to generate the documentation. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. # This will remove the Files entry from the Quick Index and from the # Folder Tree View (if specified). The default is YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the # Namespaces page. This will remove the Namespaces entry from the Quick Index # and from the Folder Tree View (if specified). The default is YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command , where is the value of # the FILE_VERSION_FILTER tag, and is the name of an input file # provided by doxygen. 
Whatever the program writes to standard output # is used as the file version. See the manual for examples. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. # You can optionally specify a file name after the option, if omitted # DoxygenLayout.xml will be used as the name of the layout file. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files # containing the references data. This must be a list of .bib files. The # .bib extension is automatically appended if omitted. Using this command # requires the bibtex tool to be installed. See also # http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style # of the bibliography can be controlled using LATEX_BIB_STYLE. To use this # feature you need bibtex and perl available in the search path. Do not use # file names with spaces, bibtex cannot handle them. CITE_BIB_FILES = #--------------------------------------------------------------------------- # configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated # by doxygen. Possible values are YES and NO. If left blank NO is used. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated by doxygen. Possible values are YES and NO. If left blank # NO is used. WARNINGS = YES # If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings # for undocumented members. If EXTRACT_ALL is set to YES then this flag will # automatically be disabled. WARN_IF_UNDOCUMENTED = YES # If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some # parameters in a documented function, or documenting parameters that # don't exist or using markup commands wrongly. WARN_IF_DOC_ERROR = YES # The WARN_NO_PARAMDOC option can be enabled to get warnings for # functions that are documented, but have no documentation for their parameters # or return value. If set to NO (the default) doxygen will only warn about # wrong or incomplete parameter documentation, but not about the absence of # documentation. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that # doxygen can produce. The string should contain the $file, $line, and $text # tags, which will be replaced by the file and line number from which the # warning originated and the warning text. Optionally the format may contain # $version, which will be replaced by the version of the file (if it could # be obtained via FILE_VERSION_FILTER) WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning # and error messages should be written. If left blank the output is written # to stderr. WARN_LOGFILE = #--------------------------------------------------------------------------- # configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag can be used to specify the files and/or directories that contain # documented source files. You may enter file names like "myfile.cpp" or # directories like "/usr/src/myproject". 
Separate the files or directories # with spaces. INPUT = ../src/clBLAS.h \ ../src/include/cltypes.h \ ../src/include/kerngen.h \ ../src/include/solver.h \ ../src/include/mempat.h \ ../src/library/gens/blas_kgen.h \ ../src/library/include/clblas-internal.h \ ../src/library/include/kernel_extra.h \ ../src/library/include/solution_seq.h \ ../src/include/granulation.h \ ../src/library/tools/ktest/step.h # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is # also the default input encoding. Doxygen uses libiconv (or the iconv built # into libc) for the transcoding. See http://www.gnu.org/software/libiconv for # the list of possible encodings. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank the following patterns are tested: # *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh # *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py # *.f90 *.f *.for *.vhd *.vhdl FILE_PATTERNS = # The RECURSIVE tag can be used to turn specify whether or not subdirectories # should be searched for input files as well. Possible values are YES and NO. # If left blank NO is used. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. Note that the wildcards are matched # against the file with absolute path, so to exclude all test directories # for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or # directories that contain example code fragments that are included (see # the \include command). EXAMPLE_PATH = ../src/samples # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp # and *.h) to filter out the source-files in the directories. If left # blank all files are included. EXAMPLE_PATTERNS = # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude # commands irrespective of the value of the RECURSIVE tag. # Possible values are YES and NO. If left blank NO is used. 
EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or # directories that contain image that are included in the documentation (see # the \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command , where # is the value of the INPUT_FILTER tag, and is the name of an # input file. Doxygen will then use the output that the filter program writes # to standard output. If FILTER_PATTERNS is specified, this tag will be ignored. # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: # pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further # info on how filters are used. If FILTER_PATTERNS is empty or if # non of the patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will be used to filter the input files when producing source # files to browse (i.e. when SOURCE_BROWSER is set to YES). FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) # and it is also possible to disable source filtering for a specific pattern # using *.ext= (so without naming a filter). This option only has effect when # FILTER_SOURCE_FILES is enabled. FILTER_SOURCE_PATTERNS = # If the USE_MD_FILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will # be generated. Documented entities will be cross-referenced with these sources. # Note: To get rid of all source code in the generated output, make sure also # VERBATIM_HEADERS is set to NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body # of functions and classes directly in the documentation. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct # doxygen to hide any special comment blocks from generated source code # fragments. Normal C, C++ and Fortran comments will always remain visible. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES # then for each documented function all documented # functions referencing it will be listed. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES # then for each documented function all documented entities # called/used by that function will be listed. 
REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES (the default) # and SOURCE_BROWSER tag is set to YES, then the hyperlinks from # functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will # link to the source code. Otherwise they will link to the documentation. REFERENCES_LINK_SOURCE = YES # If the USE_HTAGS tag is set to YES then the references to source code # will point to the HTML generated by the htags(1) tool instead of doxygen # built-in source browser. The htags tool is part of GNU's global source # tagging system (see http://www.gnu.org/software/global/global.html). You # will need version 4.8.6 or higher. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen # will generate a verbatim copy of the header file for each class for # which an include is specified. Set to NO to disable this. VERBATIM_HEADERS = YES # If CLANG_ASSISTED_PARSING is set to YES, then doxygen will use the clang parser # for more acurate parsing at the cost of reduced performance. This can be # particularly helpful with template rich C++ code for which doxygen's built-in # parser lacks the necessairy type information. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified at INPUT and INCLUDE_PATH. CLANG_OPTIONS = #--------------------------------------------------------------------------- # configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index # of all compounds will be generated. Enable this if the project # contains a lot of classes, structs, unions or interfaces. ALPHABETICAL_INDEX = NO # If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then # the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns # in which this list will be split (can be a number in the range [1..20]) COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all # classes will be put under the same header in the alphabetical index. # The IGNORE_PREFIX tag can be used to specify one or more prefixes that # should be ignored while generating the index headers. IGNORE_PREFIX = #--------------------------------------------------------------------------- # configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES (the default) Doxygen will # generate HTML output. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `html' will be used as the default path. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for # each generated HTML page (for example: .htm,.php,.asp). If it is left blank # doxygen will generate files with .html extension. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a personal HTML header for # each generated HTML page. If it is left blank doxygen will generate a # standard header. 
Note that when using a custom header you are responsible # for the proper inclusion of any scripts and style sheets that doxygen # needs, which is dependent on the configuration options used. # It is advised to generate a default header using "doxygen -w html # header.html footer.html stylesheet.css YourConfigFile" and then modify # that header. Note that the header is subject to change so you typically # have to redo this when upgrading to a newer version of doxygen or when # changing the value of configuration settings such as GENERATE_TREEVIEW! HTML_HEADER = # The HTML_FOOTER tag can be used to specify a personal HTML footer for # each generated HTML page. If it is left blank doxygen will generate a # standard footer. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading # style sheet that is used by each HTML page. It can be used to # fine-tune the look of the HTML output. If left blank doxygen will # generate a default style sheet. Note that it is recommended to use # HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this # tag will in the future become obsolete. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify an additional # user-defined cascading style sheet that is included after the standard # style sheets created by doxygen. Using this option one can overrule # certain style aspects. This is preferred over using HTML_STYLESHEET # since it does not replace the standard style sheet and is therefor more # robust against future updates. Doxygen will copy the style sheet file to # the output directory. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that # the files will be copied as-is; there are no commands or markers available. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. # Doxygen will adjust the colors in the style sheet and background images # according to this color. Hue is specified as an angle on a colorwheel, # see http://en.wikipedia.org/wiki/Hue for more information. # For instance the value 0 represents red, 60 is yellow, 120 is green, # 180 is cyan, 240 is blue, 300 purple, and 360 is red again. # The allowed range is 0 to 359. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of # the colors in the HTML output. For a value of 0 the output will use # grayscales only. A value of 255 will produce the most vivid colors. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to # the luminance component of the colors in the HTML output. Values below # 100 gradually make the output lighter, whereas values above 100 make # the output darker. The value divided by 100 is the actual gamma applied, # so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, # and 100 does not change the gamma. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting # this to NO can help when comparing the output of multiple runs. 
HTML_TIMESTAMP = YES # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. HTML_DYNAMIC_SECTIONS = YES # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of # entries shown in the various tree structured indices initially; the user # can expand and collapse entries dynamically later on. Doxygen will expand # the tree to such a level that at most the specified number of entries are # visible (unless a fully collapsed tree already exceeds this amount). # So setting the number of entries 1 will produce a full collapsed tree by # default. 0 is a special value representing an infinite number of entries # and will result in a full expanded tree by default. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files # will be generated that can be used as input for Apple's Xcode 3 # integrated development environment, introduced with OSX 10.5 (Leopard). # To create a documentation set, doxygen will generate a Makefile in the # HTML output directory. Running make will produce the docset in that # directory and running "make install" will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find # it at startup. # See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. GENERATE_DOCSET = NO # When GENERATE_DOCSET tag is set to YES, this tag determines the name of the # feed. A documentation feed provides an umbrella under which multiple # documentation sets from a single provider (such as a company or product suite) # can be grouped. DOCSET_FEEDNAME = "Doxygen generated docs" # When GENERATE_DOCSET tag is set to YES, this tag specifies a string that # should uniquely identify the documentation set bundle. This should be a # reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen # will append .docset to the name. DOCSET_BUNDLE_ID = org.doxygen.Project # When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely # identify the documentation publisher. This should be a reverse domain-name # style string, e.g. com.mycompany.MyDocSet.documentation. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES, additional index files # will be generated that can be used as input for tools like the # Microsoft HTML help workshop to generate a compiled HTML help file (.chm) # of the generated HTML documentation. GENERATE_HTMLHELP = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can # be used to specify the file name of the resulting .chm file. You # can add a path in front of the file if the result should not be # written to the html output directory. CHM_FILE = # If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can # be used to specify the location (absolute path including file name) of # the HTML help compiler (hhc.exe). If non-empty doxygen will try to run # the HTML help compiler on the generated index.hhp. HHC_LOCATION = # If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag # controls if a separate .chi index file is generated (YES) or that # it should be included in the master .chm file (NO). 
GENERATE_CHI = NO # If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING # is used to encode HtmlHelp index (hhk), content (hhc) and project file # content. CHM_INDEX_ENCODING = # If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag # controls whether a binary table of contents is generated (YES) or a # normal table of contents (NO) in the .chm file. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members # to the contents of the HTML help documentation and to the tree view. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated # that can be used as input for Qt's qhelpgenerator to generate a # Qt Compressed Help (.qch) of the generated HTML documentation. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can # be used to specify the file name of the resulting .qch file. # The path specified is relative to the HTML output folder. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#namespace QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating # Qt Help Project output. For more information please see # http://doc.trolltech.com/qthelpproject.html#virtual-folders QHP_VIRTUAL_FOLDER = doc # If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to # add. For more information please see # http://doc.trolltech.com/qthelpproject.html#custom-filters QHP_CUST_FILTER_NAME = # The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see # # Qt Help Project / Custom Filters. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's # filter section matches. # # Qt Help Project / Filter Attributes. QHP_SECT_FILTER_ATTRS = # If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can # be used to specify the location of Qt's qhelpgenerator. # If non-empty doxygen will try to run qhelpgenerator on the generated # .qhp file. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files # will be generated, which together with the HTML files, form an Eclipse help # plugin. To install this plugin and make it available under the help contents # menu in Eclipse, the contents of the directory containing the HTML and XML # files needs to be copied into the plugins directory of eclipse. The name of # the directory within the plugins directory should be the same as # the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before # the help appears. GENERATE_ECLIPSEHELP = NO # A unique identifier for the eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have # this name. ECLIPSE_DOC_ID = org.doxygen.Project # The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) # at top of each HTML page. The value NO (the default) enables the index and # the value YES disables it. Since the tabs have the same information as the # navigation tree you can set this option to NO if you already set # GENERATE_TREEVIEW to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. 
# If the tag value is set to YES, a side panel will be generated # containing a tree-like index structure (just like the one that # is generated for HTML Help). For this to work a browser that supports # JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). # Windows users are probably better off using the HTML help feature. # Since the tree basically has the same information as the tab index you # could consider to set DISABLE_INDEX to NO when enabling this option. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values # (range [0,1..20]) that doxygen will group on one line in the generated HTML # documentation. Note that a value of 0 will completely suppress the enum # values from appearing in the overview section. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be # used to set the initial width (in pixels) of the frame in which the tree # is shown. TREEVIEW_WIDTH = 250 # When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open # links to external symbols imported via tag files in a separate window. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of Latex formulas included # as images in the HTML documentation. The default is 10. Note that # when you change the font size after a successful doxygen run you need # to manually remove any form_*.png images from the HTML output directory # to force them to be regenerated. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are # not supported properly for IE 6.0, but are supported on all modern browsers. # Note that when changing this option you need to delete any form_*.png files # in the HTML output before the changes have effect. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax # (see http://www.mathjax.org) which uses client side Javascript for the # rendering instead of using prerendered bitmaps. Use this if you do not # have LaTeX installed or if you want to formulas look prettier in the HTML # output. When enabled you may also need to install MathJax separately and # configure the path to it using the MATHJAX_RELPATH option. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. Supported types are HTML-CSS, NativeMML (i.e. MathML) and # SVG. The default value is HTML-CSS, which is slower, but has the best # compatibility. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the # HTML output directory using the MATHJAX_RELPATH option. The destination # directory should contain the MathJax.js script. For instance, if the mathjax # directory is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to # the MathJax Content Delivery Network so you can quickly see the result without # installing MathJax. However, it is strongly recommended to install a local # copy of MathJax from http://www.mathjax.org before deployment. MATHJAX_RELPATH = http://www.mathjax.org/mathjax # The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension # names that should be enabled during MathJax rendering. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript # pieces of code that will be used on startup of the MathJax code. 
MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box # for the HTML output. The underlying search engine uses javascript # and DHTML and should work on any modern browser. Note that when using # HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets # (GENERATE_DOCSET) there is already a search function so this one should # typically be disabled. For large projects the javascript based search engine # can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. SEARCHENGINE = YES # When the SERVER_BASED_SEARCH tag is enabled the search engine will be # implemented using a web server instead of a web client using Javascript. # There are two flavours of web server based search depending on the # EXTERNAL_SEARCH setting. When disabled, doxygen will generate a PHP script for # searching and an index file used by the script. When EXTERNAL_SEARCH is # enabled the indexing and searching needs to be provided by external tools. # See the manual for details. SERVER_BASED_SEARCH = NO # When EXTERNAL_SEARCH is enabled doxygen will no longer generate the PHP # script for searching. Instead the search results are written to an XML file # which needs to be processed by an external indexer. Doxygen will invoke an # external search engine pointed to by the SEARCHENGINE_URL option to obtain # the search results. Doxygen ships with an example indexer (doxyindexer) and # search engine (doxysearch.cgi) which are based on the open source search # engine library Xapian. See the manual for configuration details. EXTERNAL_SEARCH = NO # The SEARCHENGINE_URL should point to a search engine hosted by a web server # which will returned the search results when EXTERNAL_SEARCH is enabled. # Doxygen ships with an example search engine (doxysearch) which is based on # the open source search engine library Xapian. See the manual for configuration # details. SEARCHENGINE_URL = # When SERVER_BASED_SEARCH and EXTERNAL_SEARCH are both enabled the unindexed # search data is written to a file for indexing by an external tool. With the # SEARCHDATA_FILE tag the name of this file can be specified. SEARCHDATA_FILE = searchdata.xml # When SERVER_BASED_SEARCH AND EXTERNAL_SEARCH are both enabled the # EXTERNAL_SEARCH_ID tag can be used as an identifier for the project. This is # useful in combination with EXTRA_SEARCH_MAPPINGS to search through multiple # projects and redirect the results back to the right project. EXTERNAL_SEARCH_ID = # The EXTRA_SEARCH_MAPPINGS tag can be used to enable searching through doxygen # projects other than the one defined by this configuration file, but that are # all added to the same external search index. Each project needs to have a # unique id set via EXTERNAL_SEARCH_ID. The search mapping then maps the id # of to a relative location where the documentation can be found. # The format is: EXTRA_SEARCH_MAPPINGS = id1=loc1 id2=loc2 ... EXTRA_SEARCH_MAPPINGS = #--------------------------------------------------------------------------- # configuration options related to the LaTeX output #--------------------------------------------------------------------------- # If the GENERATE_LATEX tag is set to YES (the default) Doxygen will # generate Latex output. GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `latex' will be used as the default path. 
LATEX_OUTPUT = latex # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be # invoked. If left blank `latex' will be used as the default command name. # Note that when enabling USE_PDFLATEX this option is only used for # generating bitmaps for formulas in the HTML output, but not in the # Makefile that is written to the output directory. LATEX_CMD_NAME = latex # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to # generate index for LaTeX. If left blank `makeindex' will be used as the # default command name. MAKEINDEX_CMD_NAME = makeindex # If the COMPACT_LATEX tag is set to YES Doxygen generates more compact # LaTeX documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_LATEX = NO # The PAPER_TYPE tag can be used to set the paper type that is used # by the printer. Possible values are: a4, letter, legal and # executive. If left blank a4 will be used. PAPER_TYPE = a4wide # The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX # packages that should be included in the LaTeX output. EXTRA_PACKAGES = # The LATEX_HEADER tag can be used to specify a personal LaTeX header for # the generated latex document. The header should contain everything until # the first chapter. If it is left blank doxygen will generate a # standard header. Notice: only use this tag if you know what you are doing! LATEX_HEADER = # The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for # the generated latex document. The footer should contain everything after # the last chapter. If it is left blank doxygen will generate a # standard footer. Notice: only use this tag if you know what you are doing! LATEX_FOOTER = # The LATEX_EXTRA_FILES tag can be used to specify one or more extra images # or other source files which should be copied to the LaTeX output directory. # Note that the files will be copied as-is; there are no commands or markers # available. LATEX_EXTRA_FILES = # If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated # is prepared for conversion to pdf (using ps2pdf). The pdf file will # contain links (just like the HTML output) instead of page references # This makes the output suitable for online browsing using a pdf viewer. PDF_HYPERLINKS = YES # If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of # plain latex in the generated Makefile. Set this option to YES to get a # higher quality PDF documentation. USE_PDFLATEX = YES # If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. # command to the generated LaTeX files. This will instruct LaTeX to keep # running if errors occur, instead of asking the user for help. # This option is also used when generating formulas in HTML. LATEX_BATCHMODE = NO # If LATEX_HIDE_INDICES is set to YES then doxygen will not # include the index chapters (such as File Index, Compound Index, etc.) # in the output. LATEX_HIDE_INDICES = NO # If LATEX_SOURCE_CODE is set to YES then doxygen will include # source code with syntax highlighting in the LaTeX output. # Note that which sources are shown also depends on other settings # such as SOURCE_BROWSER. LATEX_SOURCE_CODE = NO # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See # http://en.wikipedia.org/wiki/BibTeX for more info. 
LATEX_BIB_STYLE = plain #--------------------------------------------------------------------------- # configuration options related to the RTF output #--------------------------------------------------------------------------- # If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output # The RTF output is optimized for Word 97 and may not look very pretty with # other RTF readers or editors. GENERATE_RTF = NO # The RTF_OUTPUT tag is used to specify where the RTF docs will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `rtf' will be used as the default path. RTF_OUTPUT = rtf # If the COMPACT_RTF tag is set to YES Doxygen generates more compact # RTF documents. This may be useful for small projects and may help to # save some trees in general. COMPACT_RTF = NO # If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated # will contain hyperlink fields. The RTF file will # contain links (just like the HTML output) instead of page references. # This makes the output suitable for online browsing using WORD or other # programs which support those fields. # Note: wordpad (write) and others do not support links. RTF_HYPERLINKS = NO # Load style sheet definitions from file. Syntax is similar to doxygen's # config file, i.e. a series of assignments. You only have to provide # replacements, missing definitions are set to their default value. RTF_STYLESHEET_FILE = # Set optional variables used in the generation of an rtf document. # Syntax is similar to doxygen's config file. RTF_EXTENSIONS_FILE = #--------------------------------------------------------------------------- # configuration options related to the man page output #--------------------------------------------------------------------------- # If the GENERATE_MAN tag is set to YES (the default) Doxygen will # generate man pages GENERATE_MAN = NO # The MAN_OUTPUT tag is used to specify where the man pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `man' will be used as the default path. MAN_OUTPUT = man # The MAN_EXTENSION tag determines the extension that is added to # the generated man pages (default is the subroutine's section .3) MAN_EXTENSION = .3 # If the MAN_LINKS tag is set to YES and Doxygen generates man output, # then it will generate one additional man file for each entity # documented in the real man page(s). These additional files # only source the real man page, but without them the man command # would be unable to find the correct page. The default is NO. MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- # If the GENERATE_XML tag is set to YES Doxygen will # generate an XML file that captures the structure of # the code including all documentation. GENERATE_XML = NO # The XML_OUTPUT tag is used to specify where the XML pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be # put in front of it. If left blank `xml' will be used as the default path. XML_OUTPUT = xml # The XML_SCHEMA tag can be used to specify an XML schema, # which can be used by a validating XML parser to check the # syntax of the XML files. XML_SCHEMA = # The XML_DTD tag can be used to specify an XML DTD, # which can be used by a validating XML parser to check the # syntax of the XML files. 
XML_DTD = # If the XML_PROGRAMLISTING tag is set to YES Doxygen will # dump the program listings (including syntax highlighting # and cross-referencing information) to the XML output. Note that # enabling this will significantly increase the size of the XML output. XML_PROGRAMLISTING = YES #--------------------------------------------------------------------------- # configuration options related to the DOCBOOK output #--------------------------------------------------------------------------- # If the GENERATE_DOCBOOK tag is set to YES Doxygen will generate DOCBOOK files # that can be used to generate PDF. GENERATE_DOCBOOK = NO # The DOCBOOK_OUTPUT tag is used to specify where the DOCBOOK pages will be put. # If a relative path is entered the value of OUTPUT_DIRECTORY will be put in # front of it. If left blank docbook will be used as the default path. DOCBOOK_OUTPUT = docbook #--------------------------------------------------------------------------- # configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will # generate an AutoGen Definitions (see autogen.sf.net) file # that captures the structure of the code including all # documentation. Note that this feature is still experimental # and incomplete at the moment. GENERATE_AUTOGEN_DEF = NO #--------------------------------------------------------------------------- # configuration options related to the Perl module output #--------------------------------------------------------------------------- # If the GENERATE_PERLMOD tag is set to YES Doxygen will # generate a Perl module file that captures the structure of # the code including all documentation. Note that this # feature is still experimental and incomplete at the # moment. GENERATE_PERLMOD = NO # If the PERLMOD_LATEX tag is set to YES Doxygen will generate # the necessary Makefile rules, Perl scripts and LaTeX code to be able # to generate PDF and DVI output from the Perl module output. PERLMOD_LATEX = NO # If the PERLMOD_PRETTY tag is set to YES the Perl module output will be # nicely formatted so it can be parsed by a human reader. This is useful # if you want to understand what is going on. On the other hand, if this # tag is set to NO the size of the Perl module output will be much smaller # and Perl will parse it just the same. PERLMOD_PRETTY = YES # The names of the make variables in the generated doxyrules.make file # are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. # This is useful so different doxyrules.make files included by the same # Makefile don't overwrite each other's variables. PERLMOD_MAKEVAR_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the preprocessor #--------------------------------------------------------------------------- # If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will # evaluate all C-preprocessor directives found in the sources and include # files. ENABLE_PREPROCESSING = YES # If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro # names in the source code. If set to NO (the default) only conditional # compilation will be performed. Macro expansion can be done in a controlled # way by setting EXPAND_ONLY_PREDEF to YES. 
MACRO_EXPANSION = NO # If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES # then the macro expansion is limited to the macros specified with the # PREDEFINED and EXPAND_AS_DEFINED tags. EXPAND_ONLY_PREDEF = NO # If the SEARCH_INCLUDES tag is set to YES (the default) the includes files # pointed to by INCLUDE_PATH will be searched when a #include is found. SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by # the preprocessor. INCLUDE_PATH = # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard # patterns (like *.h and *.hpp) to filter out the header-files in the # directories. If left blank, the patterns specified with FILE_PATTERNS will # be used. INCLUDE_FILE_PATTERNS = # The PREDEFINED tag can be used to specify one or more macro names that # are defined before the preprocessor is started (similar to the -D option of # gcc). The argument of the tag is a list of macros of the form: name # or name=definition (no spaces). If the definition and the = are # omitted =1 is assumed. To prevent a macro definition from being # undefined via #undef or recursively expanded use the := operator # instead of the = operator. PREDEFINED = # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then # this tag can be used to specify a list of macro names that should be expanded. # The macro definition that is found in the sources will be used. # Use the PREDEFINED tag if you want to use a different macro definition that # overrules the definition found in the source code. EXPAND_AS_DEFINED = # If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then # doxygen's preprocessor will remove all references to function-like macros # that are alone on a line, have an all uppercase name, and do not end with a # semicolon, because these will confuse the parser if not removed. SKIP_FUNCTION_MACROS = YES #--------------------------------------------------------------------------- # Configuration::additions related to external references #--------------------------------------------------------------------------- # The TAGFILES option can be used to specify one or more tagfiles. For each # tag file the location of the external documentation should be added. The # format of a tag file without this location is as follows: # TAGFILES = file1 file2 ... # Adding location for the tag files is done as follows: # TAGFILES = file1=loc1 "file2 = loc2" ... # where "loc1" and "loc2" can be relative or absolute paths # or URLs. Note that each tag file must have a unique name (where the name does # NOT include the path). If a tag file is not located in the directory in which # doxygen is run, you must also specify the path to the tagfile here. TAGFILES = # When a file name is specified after GENERATE_TAGFILE, doxygen will create # a tag file that is based on the input files it reads. GENERATE_TAGFILE = # If the ALLEXTERNALS tag is set to YES all external classes will be listed # in the class index. If set to NO only the inherited external classes # will be listed. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed # in the modules index. If set to NO, only the current project's groups will # be listed. EXTERNAL_GROUPS = YES # If the EXTERNAL_PAGES tag is set to YES all external pages will be listed # in the related pages index. If set to NO, only the current project's # pages will be listed. 
EXTERNAL_PAGES = YES # The PERL_PATH should be the absolute path and name of the perl script # interpreter (i.e. the result of `which perl'). PERL_PATH = /usr/bin/perl #--------------------------------------------------------------------------- # Configuration options related to the dot tool #--------------------------------------------------------------------------- # If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will # generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base # or super classes. Setting the tag to NO turns the diagrams off. Note that # this option also works with HAVE_DOT disabled, but it is recommended to # install and use dot, since it yields more powerful graphs. CLASS_DIAGRAMS = NO # You can define message sequence charts within doxygen comments using the \msc # command. Doxygen will then run the mscgen tool (see # http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the # documentation. The MSCGEN_PATH tag allows you to specify the directory where # the mscgen tool resides. If left empty the tool is assumed to be found in the # default search path. MSCGEN_PATH = # If set to YES, the inheritance and collaboration graphs will hide # inheritance and usage relations if the target is undocumented # or is not a class. HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz, a graph visualization # toolkit from AT&T and Lucent Bell Labs. The other options in this section # have no effect if this option is set to NO (the default) HAVE_DOT = NO # The DOT_NUM_THREADS specifies the number of dot invocations doxygen is # allowed to run in parallel. When set to 0 (the default) doxygen will # base this on the number of processors available in the system. You can set it # explicitly to a value larger than 0 to get control over the balance # between CPU load and processing speed. DOT_NUM_THREADS = 0 # By default doxygen will use the Helvetica font for all dot files that # doxygen generates. When you want a differently looking font you can specify # the font name using DOT_FONTNAME. You need to make sure dot is able to find # the font, which can be done by putting it in a standard location or by setting # the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the # directory containing the font. DOT_FONTNAME = FreeSans # The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. # The default size is 10pt. DOT_FONTSIZE = 10 # By default doxygen will tell dot to use the Helvetica font. # If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to # set the path where dot can find it. DOT_FONTPATH = # If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect inheritance relations. Setting this tag to YES will force the # CLASS_DIAGRAMS tag to NO. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen # will generate a graph for each documented class showing the direct and # indirect implementation dependencies (inheritance, containment, and # class references variables) of the class with other documented classes. 
COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen # will generate a graph for groups, showing the direct groups dependencies GROUP_GRAPHS = YES # If the UML_LOOK tag is set to YES doxygen will generate inheritance and # collaboration diagrams in a style similar to the OMG's Unified Modeling # Language. UML_LOOK = NO # If the UML_LOOK tag is enabled, the fields and methods are shown inside # the class node. If there are many fields or methods and many nodes the # graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS # threshold limits the number of items for each type to make the size more # manageable. Set this to 0 for no limit. Note that the threshold may be # exceeded by 50% before the limit is enforced. UML_LIMIT_NUM_FIELDS = 10 # If set to YES, the inheritance and collaboration graphs will show the # relations between templates and their instances. TEMPLATE_RELATIONS = NO # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT # tags are set to YES then doxygen will generate a graph for each documented # file showing the direct and indirect include dependencies of the file with # other documented files. INCLUDE_GRAPH = YES # If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and # HAVE_DOT tags are set to YES then doxygen will generate a graph for each # documented header file showing the documented files that directly or # indirectly include this file. INCLUDED_BY_GRAPH = YES # If the CALL_GRAPH and HAVE_DOT options are set to YES then # doxygen will generate a call dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable call graphs # for selected functions only using the \callgraph command. CALL_GRAPH = NO # If the CALLER_GRAPH and HAVE_DOT tags are set to YES then # doxygen will generate a caller dependency graph for every global function # or class method. Note that enabling this option will significantly increase # the time of a run. So in most cases it will be better to enable caller # graphs for selected functions only using the \callergraph command. CALLER_GRAPH = NO # If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen # will generate a graphical hierarchy of all classes instead of a textual one. GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES # then doxygen will show the dependencies a directory has on other directories # in a graphical way. The dependency relations are determined by the #include # relations between the files in the directories. DIRECTORY_GRAPH = YES # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. Possible values are svg, png, jpg, or gif. # If left blank png will be used. If you choose svg you need to set # HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible in IE 9+ (other browsers do not have this requirement). DOT_IMAGE_FORMAT = png # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. # Note that this requires a modern browser other than Internet Explorer. # Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you # need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files # visible. Older versions of IE do not have SVG support. 
INTERACTIVE_SVG = NO # The tag DOT_PATH can be used to specify the path where the dot tool can be # found. If left blank, it is assumed the dot tool can be found in the path. DOT_PATH = # The DOTFILE_DIRS tag can be used to specify one or more directories that # contain dot files that are included in the documentation (see the # \dotfile command). DOTFILE_DIRS = # The MSCFILE_DIRS tag can be used to specify one or more directories that # contain msc files that are included in the documentation (see the # \mscfile command). MSCFILE_DIRS = # The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of # nodes that will be shown in the graph. If the number of nodes in a graph # becomes larger than this value, doxygen will truncate the graph, which is # visualized by representing a node as a red box. Note that doxygen if the # number of direct children of the root node in a graph is already larger than # DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note # that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. DOT_GRAPH_MAX_NODES = 50 # The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the # graphs generated by dot. A depth value of 3 means that only nodes reachable # from the root by following a path via at most 3 edges will be shown. Nodes # that lay further from the root node will be omitted. Note that setting this # option to 1 or 2 may greatly reduce the computation time needed for large # code bases. Also note that the size of a graph can be further restricted by # DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. MAX_DOT_GRAPH_DEPTH = 0 # Set the DOT_TRANSPARENT tag to YES to generate images with a transparent # background. This is disabled by default, because dot on Windows does not # seem to support this out of the box. Warning: Depending on the platform used, # enabling this option may lead to badly anti-aliased labels on the edges of # a graph (i.e. they become hard to read). DOT_TRANSPARENT = NO # Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) # support this, this feature is disabled by default. DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will # generate a legend page explaining the meaning of the various boxes and # arrows in the dot generated graphs. GENERATE_LEGEND = YES # If the DOT_CLEANUP tag is set to YES (the default) Doxygen will # remove the intermediate dot files that are used to generate # the various graphs. 
DOT_CLEANUP = YES clblas-2.10/doc/performance/000077500000000000000000000000001264277366700157665ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.6.0/000077500000000000000000000000001264277366700175515ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/000077500000000000000000000000001264277366700202725ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/README.txt000066400000000000000000000017731264277366700220000ustar00rootroot00000000000000################################ # # # Benchmarking Methodology # # # ################################ ############ # Hardware # ############ S9150 ############ # Software # ############ CentOS 6.6 clBLAS 2.6.0 driver 14.502 ############ # Settings # ############ gpu clocks: set to max level using proprietary tool though public alternatives exist clBLAS: m=n=k=lda=ldb=ldc (for simplicity) alpha=beta=1 gemms were column-major, op(A,B)=N,T ############ # Sampling # ############ For each data point, we took 10 samples. Each sample consists of 10 gemm calls with a wait afterward. Outlying samples beyond 1 standard deviation were removed (rarely if ever did this actually need to happen). Before running the 10 samples, one warm-up sample was executed (but not included in the stastics). GFlop/s was calculated as (2*m*n*k flops) / (host time for 10 kernels / 10) // real data (8*m*n*k flops) / (host time for 10 kernels / 10) // complex data clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/cgemmNT_S9150_14.50.2_2.6.0_8.csv000066400000000000000000002346161264277366700250170ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,0.169186 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1.07683 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3.01834 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,6.24599 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,11.275 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,16.6805 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,24.2271 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,35.0577 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,49.4778 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,67.4683 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,80.5641 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,95.8804 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,120.872 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,139.516 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,153.09 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,188.572 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,200.235 
144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,240.127 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,260.279 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,307.277 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,335.958 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,365.584 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,416.063 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,446.273 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,475.766 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,524.145 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,550.731 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,593.579 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,625.806 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,674.712 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,720.5 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,781.927 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,828.258 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,874.134 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,956.67 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1025.51 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1063.97 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1139.21 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1187.02 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1274.96 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1329.6 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1378.51 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1451.44 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1552.87 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1619.51 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1674.39 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1737.74 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1858.32 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1915.84 
400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2006.19 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2079.53 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2201.48 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1224.82 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1272.94 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1324.02 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1845.93 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1445.3 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1484.14 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1526.63 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2111.69 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1641.3 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1702.08 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1755.56 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2382.12 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1883.06 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1959.24 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1999.03 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2655.55 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2130.12 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2194.07 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2262.91 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2733.56 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1655.58 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1684.97 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1738.78 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2062.95 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1837.44 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1887.88 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1927.49 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2292.19 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2046.56 
656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2092.8 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2147.3 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2538.81 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2251.77 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2298.75 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2356.15 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2771.71 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1882.52 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1912.41 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,1956.31 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2510.89 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2048.92 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2103.89 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2149.5 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2565.51 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2243.2 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2284.65 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2336.01 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2790.72 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2435.42 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2479.92 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2531.37 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3018.92 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2118.65 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2162.42 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2199.0 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2655.78 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2282.32 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2317.88 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2367.54 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2829.91 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2456.04 
912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2494.7 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2533.84 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3022.78 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2209.28 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2243.04 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2281.28 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2758.84 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2367.21 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2403.13 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2442.88 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2918.61 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2533.17 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2576.43 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2618.65 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3083.66 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2321.96 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2349.16 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2383.14 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2875.41 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2467.85 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2505.82 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2536.99 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3024.02 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2617.28 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2652.43 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2689.86 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3182.6 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2428.56 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2461.76 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2498.23 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2985.27 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2555.81 
1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2595.46 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2623.62 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3149.77 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2416.54 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2426.85 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2460.56 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2986.32 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2528.04 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2563.95 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2593.12 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3120.76 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2402.89 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2436.1 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2484.08 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2999.55 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2533.91 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2568.16 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2596.73 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3115.85 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2424.53 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2448.34 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2478.67 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3024.34 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2525.54 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2569.97 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2582.13 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3122.51 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2647.07 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2676.98 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2705.66 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3249.48 
1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2542.39 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2567.95 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2602.75 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3144.1 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2473.17 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2498.2 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2528.73 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3079.67 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2591.08 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2614.7 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2640.81 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3170.07 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2527.09 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2557.19 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2583.11 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3091.1 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2643.23 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2658.16 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2685.83 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3201.16 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2550.24 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2577.85 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2597.72 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3134.72 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2658.5 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2682.65 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2703.2 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3237.38 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2565.66 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2591.08 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2618.91 
1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3173.05 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2660.68 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2691.84 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2716.21 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3274.51 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2624.01 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2633.09 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2663.8 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3213.95 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2561.45 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2578.55 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2591.77 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3173.0 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2655.44 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2678.46 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2732.26 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3254.62 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2620.85 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2643.5 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2664.57 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3216.74 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2558.09 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2580.69 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2606.68 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3183.15 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2657.48 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2689.18 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.55 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3263.31 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2616.8 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2634.27 
1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2658.57 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3227.57 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2579.71 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2606.7 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2623.21 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3203.63 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2667.67 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2693.31 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2719.18 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3277.69 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2654.46 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2670.44 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.58 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3249.91 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2653.05 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2675.7 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.92 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3213.29 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2631.0 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2633.46 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2655.07 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3207.86 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2671.79 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2695.04 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2718.17 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3275.52 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2658.6 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2675.16 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2685.07 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3256.08 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2619.3 
2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2643.87 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2666.42 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3241.21 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2608.89 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2629.13 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2640.56 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3227.56 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2695.31 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2701.44 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2720.24 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3290.82 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2672.8 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2685.42 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.74 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3275.93 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2645.85 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2663.75 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2719.22 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3262.29 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2663.33 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2659.98 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2674.78 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3255.56 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2626.74 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2646.8 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2670.27 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3246.43 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2700.04 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.46 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.53 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3305.57 
2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2679.06 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2702.33 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2726.49 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3296.19 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2678.38 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2699.65 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2709.34 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3288.21 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2669.43 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2691.76 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2712.69 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3285.13 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2667.18 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2681.46 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2698.41 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3280.13 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2693.68 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2707.41 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.2 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3267.57 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2700.99 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2692.37 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2716.4 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3273.86 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2661.82 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2677.87 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2696.97 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3271.07 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2660.76 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2676.75 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2683.22 
2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3269.16 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2647.99 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2679.03 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.42 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3267.88 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2664.67 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2683.55 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2686.74 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3267.91 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2658.67 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2684.1 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2693.56 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3268.46 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2644.86 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2664.81 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2660.59 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3269.93 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2664.52 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2680.24 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2748.18 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3312.31 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2662.48 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2660.24 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2671.91 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3273.48 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2672.9 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2688.16 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2703.48 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3274.76 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2681.55 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2699.01 
2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2710.13 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3277.97 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2681.29 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2703.95 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2715.33 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3281.05 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2690.23 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2709.39 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2717.1 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3285.19 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2694.58 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2711.59 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2720.96 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3289.42 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2684.95 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.2 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2710.01 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3294.27 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2718.16 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2737.18 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2758.99 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3290.47 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2736.66 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2734.15 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.34 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3302.58 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2711.58 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2725.08 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.04 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3306.81 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2723.55 
3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.98 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.01 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3310.27 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2713.3 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.51 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.7 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3314.21 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.0 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2752.12 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2752.07 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3319.73 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2702.29 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2716.46 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2717.17 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3294.53 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2707.92 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.68 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.86 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3302.14 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2696.83 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2725.22 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2742.01 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3305.8 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2723.15 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2732.59 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2733.32 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3313.64 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2722.33 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.0 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.21 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3317.92 
3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2700.23 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2722.89 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2724.44 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3297.31 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2697.72 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.16 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2736.07 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3305.62 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.07 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.92 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.88 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3312.46 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2713.02 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.37 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.22 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3317.13 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2701.75 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2711.82 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2720.12 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3300.4 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2724.47 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.65 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2759.03 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3303.02 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.61 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.56 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.05 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3315.95 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2733.2 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.48 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2765.0 
3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3320.96 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.55 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2735.32 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2736.55 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3306.74 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2726.98 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.79 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.78 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3314.8 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2746.38 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2759.87 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.97 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3320.24 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2713.87 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2729.33 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2736.33 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3308.03 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.75 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.51 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.14 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3316.22 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.93 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.66 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2770.55 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3321.09 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.7 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2736.97 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.5 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3311.82 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2739.8 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2757.11 
3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2765.74 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3320.0 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.56 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2767.78 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2773.53 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3325.6 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2735.53 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2746.19 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.34 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3317.3 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.11 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.08 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2764.69 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3322.89 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.37 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2733.82 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.43 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3314.79 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.98 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2755.75 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2766.14 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3323.42 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2734.27 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2746.6 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.35 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3307.66 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.31 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.19 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2762.54 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3322.32 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.85 
4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.2 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.36 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3313.07 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2748.16 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.28 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2759.26 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3321.83 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2724.41 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.66 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2732.74 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.02 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.23 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2759.21 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.34 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3322.5 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2735.99 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2746.72 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2759.17 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3315.39 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2752.6 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2764.86 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2768.09 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3323.95 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2734.75 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.47 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.87 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3315.19 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2758.14 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2767.01 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2769.97 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3325.72 
4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.72 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.25 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2764.89 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3319.88 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.75 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2766.8 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2770.03 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3326.17 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2737.57 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2748.85 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2764.74 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3323.2 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.57 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2770.9 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2767.32 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.74 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.61 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2753.48 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2768.55 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3326.03 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.14 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.24 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.86 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3321.45 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2748.97 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2762.21 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2778.24 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3324.5 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.6 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.68 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2762.03 
4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3325.45 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.42 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2732.38 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.66 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3321.55 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.11 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2753.95 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2755.06 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.45 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.04 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.06 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2761.53 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3325.85 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2746.06 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.95 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.29 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3323.0 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2748.21 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2757.26 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2770.69 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.81 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2753.57 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.31 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2758.23 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.74 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2734.19 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.66 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.77 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3324.29 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.73 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2742.52 
4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.41 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3323.71 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2702.0 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2712.15 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.48 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.68 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.15 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2762.24 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2762.74 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.89 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.12 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2755.22 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.75 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3326.04 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.25 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.26 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.03 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3324.46 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2739.63 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.49 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2769.13 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.8 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.38 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.5 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2753.05 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3330.07 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.73 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2741.31 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2765.21 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3325.85 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.55 
5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.17 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2750.29 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.58 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2716.27 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.55 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.42 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3326.5 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.87 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2742.83 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.96 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3331.36 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.59 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.67 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2754.04 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.89 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2742.23 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.56 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.95 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3331.02 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.85 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2737.0 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2758.87 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3330.47 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2743.19 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2738.62 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2742.49 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.89 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2718.61 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2726.63 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2744.82 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.31 
5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2730.44 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2732.9 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2728.28 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.27 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2711.88 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2727.58 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.33 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.85 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2724.48 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2722.9 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2723.67 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.84 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2721.82 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2726.22 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2756.22 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3327.48 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2734.67 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2740.32 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2745.2 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.67 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2714.14 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2709.42 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2722.54 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.22 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2698.59 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2699.62 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2705.53 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.7 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2694.06 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2731.89 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.15 
5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3330.76 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.1 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2751.04 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2749.63 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3330.13 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2747.36 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2760.13 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2763.68 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.9 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2753.35 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2765.87 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2769.2 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3329.89 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2757.54 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2766.53 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,2772.94 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,S9150_14.50.2,3328.53
clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/dgemmNT_S9150_14.50.2_2.6.0_8.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,0.0343509 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,0.420318 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1.04926 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,3.01731 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,2.31716 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,9.61252 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,3.28654 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,20.5523 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,20.3738 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,37.7025 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,9.37569 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,61.3548
104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,14.4019 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,84.8386 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,67.5396 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,117.127 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,25.7784 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,161.974 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,31.0931 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,187.675 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,157.739 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,244.639 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,46.1291 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,284.653 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,101.943 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,339.07 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,264.402 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,371.551 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,74.282 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,447.813 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,80.7293 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,493.593 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,416.143 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,523.032 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,216.372 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,585.415 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,121.339 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,536.769 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,572.18 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,610.887 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,152.529 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,589.618 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,159.998 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,653.592 
360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,675.929 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,578.648 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,190.528 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,592.695 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,216.339 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,683.469 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,868.23 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,715.122 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,201.339 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,690.194 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,561.603 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,722.444 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,877.871 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,775.661 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,241.484 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,743.526 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,265.964 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,761.104 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1036.55 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,959.829 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,779.877 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,784.972 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,310.744 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1079.09 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,342.145 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,741.606 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,350.122 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,931.165 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,315.695 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,324.205 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,857.092 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,911.64 
616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,352.397 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1098.52 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,359.401 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1012.76 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,390.015 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,398.836 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,397.597 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,908.223 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1110.54 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,375.323 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,370.14 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1231.06 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,401.207 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1048.23 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,405.386 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1115.76 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,415.079 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,424.567 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,897.234 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1178.68 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,476.73 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,486.967 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,484.738 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1307.36 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,449.585 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1331.39 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,455.327 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1414.07 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1097.76 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,495.412 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,492.795 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1492.56 
872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,464.279 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1206.4 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,471.889 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1333.07 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,500.626 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1143.49 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1103.24 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1432.57 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,535.868 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,544.824 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,541.549 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1259.16 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,570.512 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,580.006 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,579.098 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1408.37 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1295.43 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1389.44 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,528.089 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,517.896 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,499.138 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,508.008 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,477.821 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1516.93 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,576.398 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,585.935 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,543.766 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,556.224 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,518.809 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1326.37 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,577.409 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1553.37 
1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,547.73 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,556.412 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,525.236 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1434.43 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,625.179 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,635.164 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,592.888 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,605.096 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,567.842 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1550.01 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,680.266 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,690.393 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,643.745 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,653.022 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,615.744 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1675.8 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,732.452 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,741.958 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,693.154 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1472.95 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,662.231 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1473.78 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,720.833 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,732.363 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,687.06 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,694.965 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,659.906 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1585.6 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,772.91 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,783.322 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,733.985 
1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,748.366 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,706.027 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1697.97 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,829.4 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,838.64 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,789.432 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,798.795 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,756.034 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1546.63 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,817.733 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,827.625 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,781.132 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,794.577 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,752.901 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1644.85 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,872.498 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,884.718 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,834.606 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,843.553 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,798.914 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1701.79 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,923.327 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,936.132 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,885.677 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,898.443 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,852.731 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1617.3 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,914.633 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1564.89 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,876.784 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,886.003 
1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,845.614 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1712.21 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,966.012 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,977.432 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,925.65 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,936.333 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,893.681 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1805.44 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1023.87 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1037.0 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,979.973 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,992.058 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,946.959 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1693.19 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1008.56 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1020.98 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,969.703 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1589.41 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,938.015 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1779.51 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1063.54 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1066.12 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1022.2 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1034.19 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,989.689 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1688.96 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1051.65 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1063.49 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1013.48 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1027.31 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,982.58 
1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1770.31 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1106.07 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1118.4 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1064.66 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1076.98 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1030.96 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1679.4 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1092.91 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1104.77 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1056.64 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1070.76 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1025.63 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1770.4 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1146.59 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1158.95 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1106.95 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1117.36 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1075.97 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1706.51 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1137.03 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1148.64 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1097.41 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1067.63 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1065.82 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1769.12 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1186.26 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1621.15 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1149.38 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1159.11 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1117.35 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1846.81 
2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1235.43 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1247.04 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1194.99 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1210.13 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1161.71 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1785.98 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1225.4 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1231.71 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1188.82 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1200.39 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1158.84 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1738.4 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1220.3 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1234.23 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1185.77 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1671.41 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1155.56 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1801.24 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1267.71 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1282.19 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1231.1 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1241.96 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1197.37 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1734.5 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1258.28 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1269.72 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1225.98 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1240.38 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1198.36 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1819.87 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1309.88 
2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1323.53 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1273.73 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1285.68 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1240.34 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1779.9 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1300.79 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1314.51 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1266.63 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1278.01 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1238.68 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1842.33 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1351.3 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1364.65 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1313.27 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1325.46 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1285.05 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1804.01 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1341.42 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1353.2 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1308.46 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1322.09 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1281.22 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1859.99 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1384.47 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,369.118 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1348.67 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1366.45 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1328.45 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1830.49 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1381.74 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1394.01 
2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1349.91 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1364.11 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1322.31 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1799.69 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1376.97 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1392.06 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1346.85 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1357.55 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1320.43 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1852.92 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1422.17 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1435.28 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1389.93 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1663.15 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1357.78 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1823.49 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1417.24 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1432.49 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1386.73 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1399.96 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1361.42 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1806.95 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1414.3 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1423.59 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1380.83 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1378.74 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1357.48 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1856.21 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1459.23 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1474.11 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1429.99 
2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1441.27 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1405.64 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1838.07 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1455.44 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1465.73 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1420.66 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1438.68 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1400.95 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1816.75 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1453.1 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1462.46 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1424.63 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1435.56 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1401.92 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1871.06 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1496.0 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1507.35 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1466.64 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1481.05 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1443.92 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1851.04 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1494.05 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1680.82 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1465.39 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1474.93 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1435.81 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1144.65 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1483.24 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1498.98 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1464.72 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1477.64 
3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1443.42 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1821.24 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1493.3 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1505.39 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1466.35 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1475.86 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1443.99 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1869.7 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1529.99 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1542.52 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1503.75 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1699.01 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1482.28 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1854.34 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1531.71 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1544.08 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1502.3 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1515.72 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1485.03 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1844.78 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1529.4 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1539.29 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1504.48 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1518.22 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1484.73 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1830.6 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1528.14 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1524.37 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1503.55 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1515.86 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1487.91 
3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1880.22 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1570.54 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1579.57 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1544.76 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1559.02 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1525.59 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1867.42 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1568.97 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1583.2 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1545.74 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1557.33 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1526.37 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1855.63 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1570.16 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1580.88 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1546.67 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1559.98 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1527.62 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1852.21 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1571.73 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1702.94 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1549.8 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1559.41 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1532.16 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1847.7 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1573.52 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1580.8 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1544.76 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,792.349 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1526.74 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1881.26 
3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1609.29 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1621.57 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1587.69 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1597.04 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1571.0 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1884.3 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1613.4 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1622.91 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1589.22 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1693.92 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1572.84 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1877.36 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1614.36 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1621.48 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1592.54 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1602.96 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1576.91 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1874.21 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1615.06 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1626.2 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1596.38 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1609.59 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1579.77 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1867.5 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1620.07 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1631.75 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1598.57 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1606.65 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1575.82 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1821.38 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1612.9 
3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1629.3 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1601.73 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1615.62 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1588.41 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1862.3 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1626.43 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1637.96 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1607.09 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1616.86 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1589.09 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1863.25 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1629.88 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1639.32 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1609.4 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1613.6 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1595.29 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1860.41 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1633.15 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1709.24 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1613.43 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1624.63 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1601.6 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1862.42 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1638.86 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1647.7 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1620.11 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1632.87 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1605.72 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1852.89 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1628.57 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,407.738 
4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1609.7 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1629.27 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1609.07 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1860.41 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1646.24 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1656.51 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1630.0 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1643.53 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1617.22 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1860.03 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1648.6 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1662.3 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1633.76 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1643.75 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1621.27 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1878.98 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1678.1 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1687.64 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1661.8 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1673.62 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1647.4 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1861.66 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1662.18 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1672.43 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1645.07 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1655.49 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1633.57 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1866.95 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1665.96 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1673.28 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1642.58 
4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1446.21 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1630.01 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1862.63 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1671.28 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1682.89 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1656.88 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1666.05 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1646.41 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1870.08 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1677.56 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1685.03 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1659.11 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1674.27 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1651.05 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1869.97 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1681.87 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1687.47 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1666.5 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1677.05 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1656.66 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1874.11 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1687.15 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1697.12 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1673.19 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1686.48 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1663.96 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1875.1 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1693.79 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1704.19 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1678.74 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1685.91 
4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1660.11 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,621.698 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1688.24 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1703.73 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1684.72 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1697.7 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1675.73 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1881.09 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1705.85 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1715.58 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1690.1 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1700.48 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1680.49 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1885.87 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1709.6 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1719.71 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1696.31 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1704.35 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1686.77 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1887.54 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1717.49 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1726.45 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1700.09 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1711.92 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1694.71 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1893.62 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1721.59 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1729.32 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1710.11 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1723.07 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1700.61 
4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1891.99 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1724.04 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1255.23 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1711.11 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1722.74 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1705.96 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1877.6 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1714.43 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1720.13 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1700.45 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1714.89 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1694.55 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1877.46 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1716.76 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1729.98 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1709.3 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1718.23 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1700.14 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1879.72 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1726.01 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1735.72 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1713.14 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1726.84 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1706.17 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1887.49 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1732.57 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1743.77 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1722.32 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1730.47 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1714.09 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1894.85 
5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1738.71 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1744.44 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1718.31 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,552.809 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1710.01 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1871.05 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1726.08 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1737.61 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1717.63 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1726.52 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1710.76 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1883.46 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1736.18 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1742.64 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1723.99 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1736.88 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1716.15 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1885.87 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1740.79 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1744.88 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1730.22 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1739.19 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1723.89 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1892.61 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1742.6 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1753.0 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1738.67 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1750.73 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1730.42 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1875.88 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1739.13 
5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1749.52 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1729.9 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1737.13 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1717.59 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1073.02 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1740.46 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1751.36 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1736.61 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1748.77 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1729.6 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1885.48 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1752.21 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1761.35 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1742.82 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1745.94 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1722.26 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1890.39 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1757.87 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1766.85 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1748.75 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1754.58 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1741.62 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1881.16 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1749.58 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1760.46 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1740.47 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1750.51 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1735.36 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1887.73 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1757.91 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1765.08 
5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1748.73 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1760.44 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1740.8 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1884.78 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1750.14 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,507.665 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1741.06 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1758.08 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1747.13 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1882.71 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1756.18 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1765.05 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1750.32 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1760.88 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1742.86 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1885.97 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1762.93 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1772.99 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1754.77 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1764.46 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1747.73 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150_14.50.2,1887.53 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/dgemm_32.csv000066400000000000000000000444361264277366700224170ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,3.0696 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,21.5402 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,62.8587 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,112.902 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,200.147 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,290.018 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,395.545 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,519.982 
288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,592.678 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,601.524 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,668.212 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,611.778 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,697.387 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,724.044 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,751.024 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,967.998 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1076.81 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,937.285 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,919.622 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1010.81 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,909.679 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1237.09 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1117.71 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1177.76 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1309.21 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1416.19 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1494.86 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1337.24 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1434.5 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1261.75 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1411.41 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,518.687 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1521.25 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,555.507 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1554.71 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1434.59 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,605.271 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,690.687 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1677.97 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1472.57 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,732.106 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1585.51 
1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,748.442 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,838.587 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1547.27 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,794.286 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,884.365 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1702.19 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,898.697 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1564.87 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1712.63 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,936.582 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1036.94 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1691.28 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1589.79 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1066.43 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1688.98 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1027.12 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1118.29 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1679.2 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1070.08 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1158.23 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1706.24 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1067.31 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1620.84 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1846.23 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1210.3 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1232.15 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1738.61 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1671.16 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1282.5 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1735.26 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1240.73 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1323.87 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1780.01 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1278.57 
2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1364.25 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1803.69 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1321.87 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,369.068 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1829.92 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1364.05 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1391.88 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1852.93 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1663.15 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1432.06 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1807.14 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1378.94 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1474.52 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1837.23 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1438.7 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1462.51 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1870.96 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1480.68 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1680.85 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1153.64 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1477.69 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1504.86 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1869.52 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1698.86 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1543.89 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1844.1 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1518.86 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1524.0 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1880.59 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1558.51 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1582.87 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1855.69 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1560.05 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1702.84 
3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1847.63 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,793.465 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1621.33 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1883.89 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1693.98 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1621.37 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1874.28 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1609.12 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1632.06 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1824.94 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1616.16 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1637.42 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1863.14 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1613.83 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1709.27 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1861.9 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1632.95 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,407.74 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1860.46 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1643.02 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1662.36 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1878.57 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1673.36 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1672.24 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1867.23 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1436.52 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1683.46 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1869.6 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1674.35 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1687.45 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1874.17 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1686.06 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1704.25 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,622.126 
4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1697.6 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1715.48 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1885.94 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1703.94 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1726.79 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1893.31 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1723.58 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1258.61 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1877.67 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1714.15 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1730.74 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1879.83 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1727.26 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1743.04 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1894.9 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,553.027 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1736.85 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1883.3 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1736.6 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1745.1 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1892.44 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1750.69 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1749.86 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1057.62 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1749.09 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1761.61 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1890.31 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1754.66 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1760.27 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1887.85 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1760.38 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,507.924 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1882.64 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1760.72 
5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1772.79 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1887.03 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/dgemm_96.csv000066400000000000000000000142551264277366700224250ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,62.8587 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,290.018 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,592.678 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,611.778 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,751.024 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,937.285 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,909.679 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1177.76 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1494.86 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1261.75 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1521.25 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1434.59 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1677.97 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1585.51 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1547.27 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1702.19 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1712.63 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1691.28 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1688.98 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1679.2 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1706.24 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1846.23 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1738.61 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1735.26 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1780.01 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1803.69 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1829.92 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1852.93 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1807.14 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1837.23 
2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1870.96 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1153.64 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1869.52 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1844.1 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1880.59 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1855.69 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1847.63 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1883.89 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1874.28 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1824.94 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1863.14 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1861.9 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1860.46 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1878.57 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1867.23 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1869.6 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1874.17 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,622.126 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1885.94 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1893.31 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1877.67 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1879.83 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1894.9 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1883.3 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1892.44 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1057.62 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1890.31 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1887.85 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1882.64 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150,1887.03 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/dtrsm_192.csv000066400000000000000000000066001264277366700225350ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,8.9202 
384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,46.185 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,126.686 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,235.366 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,375.406 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,475.497 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,599.527 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,437.835 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,778.815 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,845.844 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,969.624 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,943.48 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1026.58 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1074.56 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1102.6 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,848.076 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1010.06 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1034.51 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1059.02 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1037.95 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1103.8 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1109.83 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1096.15 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1055.28 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1140.07 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1152.31 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1165.47 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1152.36 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1193.66 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,s9150_dtrsm_14502,1199.05 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/generate_graphs.sh000077500000000000000000000052261264277366700237740ustar00rootroot00000000000000 # sgemm AMD vs NVIDIA python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_sp.csv \ -d ../../cuBLAS_7.0/Tesla_K40/peak_sp.csv \ -d sgemm_32.csv \ -d 
../../cuBLAS_7.0/Tesla_K40/sgemm.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "sgemm S9150 vs K40" --outputfile sgemm_S9150_K40.png # sgemm AMD only python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_sp.csv \ -d sgemm_32.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "sgemm S9150" --outputfile sgemm_S9150.png # dgemm AMD vs NVIDIA python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d ../../cuBLAS_7.0/Tesla_K40/peak_dp.csv \ -d dgemm_96.csv \ -d ../../cuBLAS_7.0/Tesla_K40/dgemm.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "dgemm S9150 vs K40" --outputfile dgemm_S9150_K40.png # dgemm AMD only python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d dgemm_96.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "dgemm S9150" --outputfile dgemm_S9150.png # zgemm AMD vs NVIDIA python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d ../../cuBLAS_7.0/Tesla_K40/peak_dp.csv \ -d zgemm_64.csv \ -d ../../cuBLAS_7.0/Tesla_K40/zgemm.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "zgemm S9150 vs K40" --outputfile zgemm_S9150_K40.png # zgemm AMD only python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d zgemm_64.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "zgemm S9150" --outputfile zgemm_S9150.png # dtrsm AMD vs NVIDIA python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d ../../cuBLAS_7.0/Tesla_K40/peak_dp.csv \ -d dtrsm_192.csv \ -d ../../cuBLAS_7.0/Tesla_K40/dtrsm.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "dtrsm S9150 vs K40" --outputfile dtrsm_S9150_K40.png # dtrsm AMD only python ../../../../src/scripts/perf/plotPerformance.py \ -d peak_dp.csv \ -d dtrsm_192.csv \ -x sizem --x_axis_label "m,n,k" \ -y gflops --y_axis_label "GFlop/s" \ --x_axis_scale linear \ --plot label \ --title "dtrsm S9150" --outputfile dtrsm_S9150.png clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/peak_dp.csv000066400000000000000000000437061264277366700224240ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,2530 
clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/peak_sp.csv000066400000000000000000000437061264277366700224430ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 
1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 
2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 
3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 
4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 
5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,S9150 Peak,5070 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/sgemmNT_S9150_14.50.2_2.6.0_8.csv000066400000000000000000002346271264277366700250410ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,0.034122 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,0.447406 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,0.677813 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3.02847 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2.18318 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,8.47773 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4.93997 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,20.1417 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,8.17182 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,32.2215 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,12.0487 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,52.6002 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,7.22642 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,72.8508 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,9.55488 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,102.5 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,12.0308 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,125.015 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,14.785 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,173.486 
168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,17.7559 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,203.159 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,21.0275 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,266.236 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,30.1415 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,290.709 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,34.7699 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,378.114 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,38.919 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,401.161 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,42.9426 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,488.206 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,46.227 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,527.004 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,49.5044 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,657.796 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,70.3409 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,645.479 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,74.672 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,797.081 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,78.6247 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,794.493 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,82.6097 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,987.529 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,86.6214 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,899.73 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,90.8487 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1013.84 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,124.646 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1051.42 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,129.097 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1214.84 
424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,134.156 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1130.98 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,137.308 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1185.52 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,140.081 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,857.267 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,144.297 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1003.88 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,197.766 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,968.329 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,202.071 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1118.81 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,204.512 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1096.57 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,207.021 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1305.3 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,208.928 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1236.08 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,212.998 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1396.75 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,284.927 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1348.51 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,288.749 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1436.74 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,287.829 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1376.6 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,289.685 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1741.01 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,289.326 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1212.47 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,292.739 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1489.18 
680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,369.955 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1286.23 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,370.356 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2128.37 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,369.729 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1418.36 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,368.834 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1528.01 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,367.832 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1515.67 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,372.219 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1942.77 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,482.178 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1491.46 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,482.089 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1589.42 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,478.999 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1397.82 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,473.916 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2480.16 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,470.31 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1500.27 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,471.16 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2478.95 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,613.497 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1592.37 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,613.022 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2454.94 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,603.047 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1547.38 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,595.473 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1665.16 
936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,589.497 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1496.5 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,590.101 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2334.67 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,709.378 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1580.53 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,703.557 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1750.09 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,694.775 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1590.62 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,684.233 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2635.73 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,678.682 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1610.02 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,678.019 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2836.42 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,859.432 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1582.08 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,851.971 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2765.96 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,838.361 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1611.82 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,825.662 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1793.99 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,815.772 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1610.85 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,811.885 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2733.31 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,948.322 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1588.51 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,942.586 
1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1865.6 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,926.71 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1615.59 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,913.373 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2813.75 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,902.025 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1623.58 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,898.484 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3232.09 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1117.95 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1601.97 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1107.24 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2837.21 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1089.18 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1609.52 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1068.26 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2032.78 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1053.14 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1579.8 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1049.58 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3100.19 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1187.27 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1618.79 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1175.5 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2113.03 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1157.17 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1621.39 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1138.26 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3177.43 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1124.45 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1607.43 
1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1116.46 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3064.18 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1278.62 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1617.07 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1267.11 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2990.14 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1246.7 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1597.24 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1231.96 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2282.56 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1217.88 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1628.95 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1216.91 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3482.1 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1453.66 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1635.94 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1440.58 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2363.28 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1416.94 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1617.42 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1392.73 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3109.14 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1373.16 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1625.16 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1366.38 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3444.49 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1545.94 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1618.86 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1528.96 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3171.86 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1504.47 
1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1625.49 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1486.18 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2534.47 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1465.96 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1630.4 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1459.92 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3435.83 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1630.49 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1625.93 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1615.66 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2618.2 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1589.6 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1625.8 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1574.55 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3302.63 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1553.55 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1627.14 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1541.24 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3443.38 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1716.25 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1630.47 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1701.15 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3212.74 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1676.5 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1633.88 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1654.83 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2688.33 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1634.98 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1900.34 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1627.98 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3468.45 
1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1801.94 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2131.36 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1786.09 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2766.95 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1762.94 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2062.33 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1746.32 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3359.31 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1726.29 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2003.48 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1719.18 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3501.07 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1887.09 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2226.35 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1867.7 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3262.61 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1846.35 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2163.58 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1830.17 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2845.36 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1813.72 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2102.11 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1805.69 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3851.17 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2072.13 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2441.97 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2054.23 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2925.77 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2024.25 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2363.28 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2001.04 
2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3332.69 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1975.43 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2283.0 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1964.75 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3600.29 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2062.7 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2417.27 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2047.46 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3412.41 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2023.53 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2350.17 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2001.46 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3007.46 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1981.21 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2287.68 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,1977.31 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3656.23 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2146.71 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2508.79 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2131.33 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3005.11 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2106.94 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2442.25 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2089.09 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3360.51 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2069.63 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2380.93 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2059.0 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3722.06 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2232.86 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2605.84 
2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2216.76 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3439.1 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2194.7 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2541.67 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2170.7 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3099.24 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2151.96 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2471.69 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2144.2 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3791.51 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2317.59 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2694.96 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2301.14 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3107.44 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2280.67 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2635.88 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2260.18 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3401.05 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2241.41 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2572.46 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2232.42 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3856.03 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2402.88 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2791.73 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2387.15 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3412.34 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2364.14 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2728.13 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2342.92 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3137.78 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2320.43 
2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2662.28 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2316.33 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3927.97 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2486.84 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2883.49 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2473.52 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3160.71 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2447.06 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2819.54 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2430.18 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3415.54 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2409.94 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2754.03 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2396.74 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3813.02 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2486.92 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2874.44 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2475.81 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3494.03 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2456.43 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2820.17 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2433.26 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3254.38 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2415.57 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2758.67 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2409.97 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3893.37 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2571.02 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2964.88 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2557.51 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3207.6 
2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2536.38 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2908.87 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2518.72 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3443.98 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2500.61 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2850.13 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2494.96 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3974.79 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2657.04 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3062.15 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2644.89 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3458.37 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2626.18 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3010.25 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2607.24 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3255.86 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2588.14 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2955.31 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2582.11 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3882.76 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2654.92 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3055.58 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2646.0 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3280.97 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2631.64 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3010.62 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2616.58 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3492.04 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2600.58 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2957.51 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2593.93 
3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3983.55 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2746.04 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3153.31 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2734.99 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3511.25 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2714.86 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3100.55 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2696.12 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3339.7 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2679.9 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3046.47 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2676.13 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3925.77 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2751.24 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3152.23 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2741.43 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3318.5 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2725.33 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3107.46 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2712.5 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3498.34 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2696.34 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3055.92 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2686.67 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4015.39 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2840.71 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3250.02 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2828.01 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3523.63 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2811.71 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3202.88 
3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2792.86 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3376.95 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2779.62 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3153.9 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2775.45 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3973.43 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2850.21 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3256.13 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2837.62 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3363.12 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2823.42 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3211.33 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2808.05 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3524.82 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2793.52 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3166.38 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2792.11 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3946.17 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2864.07 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3268.16 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2851.73 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3492.88 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2838.9 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3228.88 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2827.09 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3384.2 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2814.34 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3186.53 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2809.55 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4035.36 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2950.52 
3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3361.04 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2939.33 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3420.93 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2921.67 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3318.59 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2909.31 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3520.42 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2896.83 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3276.66 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2893.89 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4014.19 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2967.43 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3379.53 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2955.77 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3509.18 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2942.64 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3336.82 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2927.71 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3411.35 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2916.01 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3294.52 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2917.23 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3999.72 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2986.57 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3393.89 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2976.33 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3450.3 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2961.71 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3356.23 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2950.42 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3533.8 
3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2939.34 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3316.93 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2935.67 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3994.07 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3008.51 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3417.37 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2997.35 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3527.62 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2985.34 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3379.08 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2973.92 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3363.94 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2966.09 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3348.56 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2965.54 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3991.5 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3031.06 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3437.65 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3019.39 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3424.03 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3007.46 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3406.31 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2993.94 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3253.4 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2984.37 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3374.71 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,2986.68 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3992.35 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3055.37 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3463.21 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3045.06 
4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3453.6 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3035.79 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3430.09 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3025.25 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3414.5 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3016.47 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3397.31 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3015.22 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4086.38 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3136.75 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3554.0 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3126.01 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3539.31 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3115.8 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3518.35 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3105.71 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3508.97 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3095.15 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3485.42 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3092.55 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4007.58 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3028.45 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3443.52 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3024.46 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3438.47 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3020.27 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3430.0 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3016.09 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3455.18 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3014.85 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3459.23 
4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3041.05 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4019.42 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3059.0 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3474.41 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3055.01 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3472.58 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3054.59 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3463.07 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3051.29 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3483.14 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3049.52 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3491.46 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3070.73 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4034.24 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3090.46 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3507.35 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3087.41 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3506.64 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3086.22 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3497.53 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3085.09 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3512.45 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3083.13 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3529.48 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3083.1 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4049.75 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3114.24 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3538.89 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3119.84 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3539.86 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3118.56 
4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3530.65 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3117.15 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3533.75 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3111.9 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3558.8 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3136.96 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4070.94 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3153.24 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3573.73 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3154.21 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3574.27 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3152.87 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3566.73 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3151.71 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3572.87 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3148.77 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3594.01 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3170.94 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4092.13 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3186.04 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3606.45 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3186.1 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3607.24 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3185.95 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3600.7 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3186.26 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3601.28 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3184.6 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3628.73 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3203.74 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4048.43 
4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3175.22 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3590.01 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3173.94 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3592.23 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3176.69 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3586.13 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3175.3 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3587.61 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3175.4 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3614.31 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3190.87 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4074.66 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3209.99 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3626.22 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3210.06 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3627.93 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3211.01 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3622.69 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3211.89 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3624.24 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3211.4 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3655.79 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3216.92 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4101.18 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3240.03 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3662.36 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3231.74 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3408.33 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3232.1 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3661.35 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3243.17 
5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3662.59 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3246.85 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3693.46 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3253.16 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4070.78 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3237.45 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3652.57 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3239.79 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3655.54 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3239.92 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3650.67 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3241.32 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3653.21 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3241.31 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3679.89 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3252.4 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4101.55 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3272.29 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3690.83 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3275.38 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3695.49 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3276.65 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3690.03 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3277.52 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3691.43 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3277.41 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3720.61 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3283.82 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4076.74 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3270.85 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3684.48 
5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3273.49 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3687.94 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3274.01 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3683.83 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3274.2 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3685.76 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3271.09 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3712.34 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3270.77 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4110.09 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3306.4 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3724.18 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3309.34 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3728.42 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3311.15 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3724.71 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3311.04 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3726.29 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3312.47 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3755.84 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3318.84 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4092.85 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3305.69 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3721.63 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3309.3 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3726.7 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3310.54 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3723.7 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3308.36 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3723.24 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3308.9 
5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3754.01 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3319.89 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4078.75 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3308.9 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3722.17 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3312.39 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3727.03 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3313.15 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3724.17 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3315.86 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3726.4 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3317.29 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3752.54 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,3325.36 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,S9150_14.50.2,4115.51
clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/sgemm_32.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3.0797 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,19.7101 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,53.0735 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,103.054 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,168.49 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,271.913 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,371.551 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,484.961 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,661.622 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,786.558 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1045.9 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1026.9 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1245.96 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1188.41
480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1020.74 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1133.79 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1305.51 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1437.68 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1430.24 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1738.47 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1472.84 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2147.36 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1535.45 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1974.22 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1595.49 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2487.82 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2517.41 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2459.1 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1666.41 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2355.5 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1748.43 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2644.94 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2855.86 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2776.18 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1798.07 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2742.11 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,1872.52 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2814.92 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3236.78 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2846.51 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2038.2 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3112.24 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2118.01 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3183.25 
1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3072.06 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2993.48 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2288.82 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3484.8 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2368.77 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3112.06 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3452.04 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3174.6 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2538.04 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3432.49 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2622.84 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3302.22 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3449.61 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3215.07 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2691.81 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3472.84 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2771.69 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3360.98 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3504.29 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3261.47 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2843.54 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3853.03 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,2929.13 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3334.19 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3601.86 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3413.71 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3005.75 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3655.15 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3008.11 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3361.83 
2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3724.54 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3438.36 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3101.16 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3790.09 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3110.1 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3401.51 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3859.16 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3413.18 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3140.71 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3930.08 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3162.77 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3416.12 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3813.78 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3494.48 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3256.76 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3894.53 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3208.74 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3444.13 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3976.02 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3458.86 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3256.31 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3883.12 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3281.61 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3492.46 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3983.2 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3511.08 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3339.78 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3926.82 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3319.1 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3498.7 
3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4015.7 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3524.21 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3377.49 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3974.18 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3363.15 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3525.16 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3945.32 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3492.94 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3384.64 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4036.39 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3421.93 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3520.42 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4015.18 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3509.59 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3412.4 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4000.1 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3451.25 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3534.18 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3994.37 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3527.49 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3364.04 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3991.69 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3424.0 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3231.34 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3992.35 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3453.23 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3415.41 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4086.52 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3539.8 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3508.89 
4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4008.28 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3438.98 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3456.03 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4019.47 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3473.35 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3478.05 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4034.76 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3507.42 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3514.78 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4050.23 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3540.25 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3534.1 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4070.74 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3574.49 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3574.08 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4092.24 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3607.49 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3601.67 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4048.2 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3592.38 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3587.92 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4074.27 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3627.57 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3624.23 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4101.37 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3435.36 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3662.97 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4070.59 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3655.5 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3653.55 
5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4101.64 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3694.86 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3690.74 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4076.52 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3688.2 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3685.97 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4110.0 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3728.33 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3726.29 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4093.07 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3726.56 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3723.32 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4078.61 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3727.07 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,3726.47 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,s9150_sgemmNT_14502,4115.41
clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/zgemmNT_S9150_14.50.2_2.6.0_8.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,0.174298 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1.40696 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,3.92031 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,8.23834 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,15.8122 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,25.0776 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,35.8766 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,32.9793 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,69.3931 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,87.8404 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,108.97 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,124.501 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,149.36
112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,172.941 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,211.765 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,145.965 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,275.969 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,315.185 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,348.35 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,385.824 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,438.38 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,467.964 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,466.062 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,361.301 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,522.79 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,562.608 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,512.176 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,562.499 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,537.487 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,569.68 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,614.513 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,624.065 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,613.708 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,640.804 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,617.214 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,651.007 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,541.554 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,575.426 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,561.366 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,783.9 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,604.692 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,632.033 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,574.55 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,689.58 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,672.24 
368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,690.728 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,655.162 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1129.22 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,666.699 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,695.936 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,716.56 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,748.399 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,656.978 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,697.162 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,680.159 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1191.19 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,733.899 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,757.594 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,713.489 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,735.564 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,710.948 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,745.116 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,752.933 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1543.07 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,711.673 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,745.731 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,735.948 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,757.307 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,780.898 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,802.651 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,739.823 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1504.17 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,772.179 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,789.072 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,749.743 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,770.368 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,766.562 
624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,782.325 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,760.776 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1547.27 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,767.207 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,784.056 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,770.857 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,785.399 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,770.438 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,788.029 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,787.859 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1612.08 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,775.705 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,793.364 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,780.976 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,794.415 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,782.586 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,796.613 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,781.804 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1642.07 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,790.231 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,802.823 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,786.645 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,800.769 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,800.087 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,812.745 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,791.131 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1729.26 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,809.964 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.385 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,796.453 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,806.994 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,787.352 
880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,806.884 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,804.822 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1790.53 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,795.742 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,811.731 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,797.491 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,808.521 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,808.233 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,818.821 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,803.003 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1715.95 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,795.754 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,810.981 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,810.691 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.755 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,804.904 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.226 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,804.934 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1781.44 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,809.249 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,814.659 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,810.495 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,821.558 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,807.06 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,819.21 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,808.378 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1755.32 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,808.694 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,818.463 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,814.922 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.969 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,812.718 
1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,820.897 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,812.173 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1834.6 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,807.824 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,820.13 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,811.035 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,819.735 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,821.324 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.202 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,817.013 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1820.76 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.696 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.251 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.167 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.011 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,814.317 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.496 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.206 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1801.01 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,812.275 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.547 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,815.107 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.286 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,812.856 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.815 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,815.553 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1815.26 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.977 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.45 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,816.592 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.614 
1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.149 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.542 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.667 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1892.93 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,820.046 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.987 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,817.732 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.459 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,819.239 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.964 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,818.351 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1838.12 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,817.985 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.395 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,819.057 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.305 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,817.86 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.026 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,821.272 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1845.03 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,820.517 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.502 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.442 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.883 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.214 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.942 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.301 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1877.72 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.272 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.98 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,821.065 
1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.566 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,820.171 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.568 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.33 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1895.49 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.024 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.369 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.432 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.698 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.877 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.87 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,822.779 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1874.6 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.494 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.355 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.328 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.985 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.694 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.04 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.622 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1894.27 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.519 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.798 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.645 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.327 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.339 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.445 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.388 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1889.81 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.766 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.9 
1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.627 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.852 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.771 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.079 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.205 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1917.44 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,823.811 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.262 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.988 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.803 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.366 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.91 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.862 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1916.84 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.544 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.343 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,824.964 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.706 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.888 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.973 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.364 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1900.35 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.443 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.297 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.48 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.083 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.373 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.32 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.458 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1915.05 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.709 
2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.367 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,825.907 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.407 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.18 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.739 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.138 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1912.66 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.842 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.625 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.331 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.697 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.71 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.03 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.479 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1925.83 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.514 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.416 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.865 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.159 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,826.846 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.105 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.353 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1924.85 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.634 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.661 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.004 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.53 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.136 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.348 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.521 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1916.22 
2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.743 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.827 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.831 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.806 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.474 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.439 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,827.85 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1923.63 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.3 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.46 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.574 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.407 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.605 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.31 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.153 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1918.13 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.89 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.861 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.285 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.021 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.308 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.572 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.326 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1924.96 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.491 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.216 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.234 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.286 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.454 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.74 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.46 
2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1928.33 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.766 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.633 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.337 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.96 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.207 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.483 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.324 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1919.07 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.207 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.459 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.743 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.715 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.321 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.781 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,828.982 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1924.05 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.929 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.45 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.044 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.435 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.668 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.378 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.747 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1934.29 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.698 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.426 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.279 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.521 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.128 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.636 
2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.44 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1927.23 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.048 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.059 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.455 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.223 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.115 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.852 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.067 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1925.1 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.23 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.921 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.229 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.359 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.76 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.325 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.459 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1935.5 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.429 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.996 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,829.863 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.155 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.439 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.041 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.715 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1931.93 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.941 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.584 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.729 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.615 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.973 
3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.048 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.272 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1933.03 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.83 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.323 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.583 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.371 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.422 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.027 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.425 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1936.37 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.087 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.532 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.98 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.712 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.083 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.509 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.089 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1936.3 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.592 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.562 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.424 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.465 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.183 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.335 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.098 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1937.0 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.258 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.733 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.031 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.694 
3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.011 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.145 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.085 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1944.39 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.807 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.663 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.815 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.842 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.659 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.897 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,830.826 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1937.58 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.605 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.717 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.108 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.451 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.649 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.618 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.882 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1943.02 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.889 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.637 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.238 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.527 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.781 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.774 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.12 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1933.29 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.908 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.953 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.228 
3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.985 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.125 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.813 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.562 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1946.3 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.377 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.253 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.539 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.876 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.934 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.017 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.749 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1944.21 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.051 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.662 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.436 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.854 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.296 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.876 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.935 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1942.07 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.087 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.971 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.933 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.029 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.189 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.916 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.692 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1919.74 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.518 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.755 
3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.825 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.889 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.308 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.855 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.79 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1942.44 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.4 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.867 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.073 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.083 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.375 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.138 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.199 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1943.28 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.39 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.865 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.901 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.031 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.722 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.355 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.123 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1945.82 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.525 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.171 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.059 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.057 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.56 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.869 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.178 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,210.841 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,831.743 
4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,834.895 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.024 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.279 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.572 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.179 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.168 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1944.42 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.751 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.039 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.179 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.027 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.859 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.003 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.634 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1946.55 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.944 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.031 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.251 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.061 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.798 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.101 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.346 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1945.58 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.754 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.261 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.276 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.366 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.859 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.049 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.475 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1513.59 
4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.874 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.011 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.387 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.168 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.863 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.282 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.539 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1948.0 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.068 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.333 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.568 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.272 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.996 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.109 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.736 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1945.49 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.934 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.324 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.672 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.28 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.073 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.239 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.575 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1947.09 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.029 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.136 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.612 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.206 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.197 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.208 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.64 
4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1418.68 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.031 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.217 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.772 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.281 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.115 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.414 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.893 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1948.13 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.164 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.261 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.811 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.38 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.215 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.137 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.826 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1947.56 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.204 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.516 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.708 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.268 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.317 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.208 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.911 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1951.52 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.236 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.298 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.936 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.549 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.214 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.384 
4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.927 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1363.34 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.24 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.28 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.904 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.317 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.297 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.235 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.103 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1950.76 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.345 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.245 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.967 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.329 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.298 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.309 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.039 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1945.85 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.308 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.417 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.118 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.595 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.382 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.365 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.121 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1952.18 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.48 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.297 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,832.952 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.345 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.466 
5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.605 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.095 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,653.937 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.515 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.271 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.184 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.499 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.426 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.391 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.286 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1950.2 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.531 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.563 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.187 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.267 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.474 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.376 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.151 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1946.76 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.58 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.408 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.111 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.385 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.517 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.345 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.197 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1950.59 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.608 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.474 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.245 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.284 
5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.526 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.384 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.417 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1274.63 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.661 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.415 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.362 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.537 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.645 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.407 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.228 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1951.78 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.586 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.384 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.211 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.427 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.711 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.368 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.407 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1946.3 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.731 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.447 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.493 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.639 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.74 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.604 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.442 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1951.71 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.753 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.517 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.395 
5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.469 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.74 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.48 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.425 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1184.8 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.767 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.486 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.444 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.478 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.796 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.528 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.521 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1952.1 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.823 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.623 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.613 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.628 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.855 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,835.504 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,833.553 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150_14.50.2,1946.8 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/zgemm_32.csv000066400000000000000000000444321264277366700224410ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,8.04616 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,32.7834 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,121.301 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,146.232 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,396.227 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,371.665 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,594.521 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,629.008 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,653.21 
320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,782.566 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,688.559 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1131.81 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,745.444 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1193.07 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,737.79 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1543.44 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,758.122 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1499.33 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,772.137 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1547.58 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,785.92 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1611.66 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,794.837 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1642.67 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,800.571 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1731.36 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,807.124 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1789.87 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,809.237 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1717.53 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,823.274 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1781.6 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,821.9 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1755.75 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,826.203 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1834.06 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,820.024 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1821.08 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,823.397 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1800.41 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,823.252 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1815.68 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,824.6 
1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1892.53 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,825.788 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1838.56 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,826.347 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1844.11 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,828.516 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1877.71 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,827.484 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1895.03 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,831.788 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1875.16 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,830.937 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1893.66 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,832.34 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1890.04 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,830.042 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1916.79 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,831.924 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1917.0 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,830.716 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1900.29 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,831.168 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1915.15 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,831.373 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1912.05 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,831.688 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1925.93 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.211 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1923.8 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,832.551 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1916.24 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.814 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1922.86 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.399 
2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1918.26 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.028 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1923.58 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.314 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1928.36 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.898 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1918.17 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.771 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1924.09 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.396 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1931.87 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,833.549 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1927.29 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.193 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1924.09 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.356 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1935.57 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.149 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1875.62 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.62 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1933.14 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.396 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1935.39 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.715 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1936.38 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.468 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1932.56 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.704 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.43 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.852 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1936.54 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.458 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.09 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.52 
3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1895.5 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.993 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1946.38 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.874 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.98 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.855 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.14 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.029 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1688.07 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,834.91 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.56 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.085 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1941.91 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.039 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.9 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.06 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,210.643 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.29 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.62 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.013 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.57 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.061 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.67 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.359 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1325.61 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.173 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1948.09 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.271 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.65 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.281 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1947.21 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.199 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1263.36 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.301 
4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1948.21 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.377 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.34 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.278 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.59 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.547 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1291.69 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.316 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.82 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.331 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.77 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.593 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1952.27 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.335 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,645.717 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.499 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.28 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.232 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.16 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.378 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.68 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.28 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1233.51 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.536 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.87 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.42 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.23 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.639 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.76 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.464 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1156.26 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.476 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1952.11 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,835.623 
5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.99 clblas-2.10/doc/performance/clBLAS_2.6.0/S9150/zgemm_64.csv000066400000000000000000000223171264277366700224440ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,32.7834 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,146.232 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,371.665 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,629.008 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,782.566 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1131.81 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1193.07 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1543.44 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1499.33 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1547.58 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1611.66 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1642.67 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1731.36 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1789.87 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1717.53 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1781.6 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1755.75 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1834.06 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1821.08 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1800.41 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1815.68 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1892.53 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1838.56 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1844.11 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1877.71 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1895.03 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1875.16 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1893.66 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1890.04 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1916.79 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1917.0 
2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1900.29 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1915.15 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1912.05 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1925.93 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1923.8 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1916.24 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1922.86 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1918.26 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1923.58 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1928.36 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1918.17 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1924.09 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1931.87 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1927.29 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1924.09 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1935.57 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1875.62 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1933.14 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1935.39 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1936.38 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1932.56 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.43 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1936.54 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.09 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1895.5 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1946.38 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.98 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.14 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1688.07 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.56 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1941.91 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.9 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,210.643 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.62 
4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.57 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.67 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1325.61 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1948.09 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.65 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1947.21 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1263.36 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1948.21 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1945.34 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.59 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1291.69 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.82 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.77 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1952.27 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,645.717 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.28 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1944.16 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1950.68 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1233.51 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.87 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1943.23 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1951.76 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1156.26 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1952.11 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,S9150,1942.99 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/000077500000000000000000000000001264277366700202715ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/README.txt000066400000000000000000000017731264277366700217770ustar00rootroot00000000000000################################ # # # Benchmarking Methodology # # # ################################ ############ # Hardware # ############ W9100 ############ # Software # ############ CentOS 6.6 clBLAS 2.6.0 driver 14.502 ############ # Settings # ############ gpu clocks: set to max level using proprietary tool though public alternatives exist clBLAS: m=n=k=lda=ldb=ldc (for simplicity) alpha=beta=1 gemms were column-major, op(A,B)=N,T ############ # Sampling # ############ For each data point, we took 10 samples. Each sample consists of 10 gemm calls with a wait afterward. 
Outlying samples beyond 1 standard deviation were removed (rarely, if ever, did this actually need to happen). Before running the 10 samples, one warm-up sample was executed (but not included in the statistics). GFlop/s was calculated as (2*m*n*k flops) / (host time for 10 kernels / 10) // real data (8*m*n*k flops) / (host time for 10 kernels / 10) // complex data clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/clblas_sgemmNT_w9100_14502.csv000066400000000000000000000560301264277366700252770ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2.71214 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,17.8432 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,51.5175 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,105.512 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,175.986 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,264.83 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,378.523 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,498.189 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,661.745 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,790.917 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,985.034 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1095.86 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1274.24 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1251.71 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1053.99 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1186.2 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1380.06 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1477.06 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1507.08 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1828.85 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1551.74 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2239.85
736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1605.21 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2059.63 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1666.8 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2604.03 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2611.62 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2577.19 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1734.15 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2450.49 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1813.39 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2746.99 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2968.76 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2883.05 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1860.83 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2859.35 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,1970.8 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2922.33 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3367.9 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2948.22 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2140.97 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3227.59 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2221.69 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3298.39 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3183.35 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3102.48 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2393.05 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3613.68 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2477.31 
1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3221.4 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3576.13 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3287.13 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2649.74 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3561.52 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2734.68 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3418.31 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3570.12 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3327.97 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2799.64 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3590.96 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2882.53 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3476.27 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3625.49 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3369.61 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2955.14 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3985.41 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3042.31 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3447.19 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3726.59 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3528.64 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3121.93 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3783.88 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3122.23 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3474.13 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3849.93 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3554.9 
2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3219.16 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3917.27 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3222.66 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3513.67 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3988.07 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3525.84 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3257.9 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4062.56 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3276.63 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3529.36 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3942.43 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3610.12 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3371.44 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4024.48 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3318.54 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3558.24 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4108.88 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3572.61 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3372.35 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4012.02 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3395.6 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3606.9 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4116.7 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3626.86 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3449.88 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4057.12 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3429.28 
3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3613.44 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4149.72 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3639.42 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3495.73 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4105.99 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3480.25 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3640.35 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4076.11 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3606.75 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3500.32 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4169.67 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3536.23 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3636.42 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4141.6 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3619.89 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3525.36 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4127.42 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3564.83 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3645.84 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4121.56 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3639.09 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3473.41 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4118.84 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3534.78 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2814.25 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4120.12 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3563.78 
4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3526.68 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4216.93 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3653.39 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3621.5 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4136.37 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3551.06 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3589.02 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4148.81 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3586.44 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3594.72 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4165.11 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3621.42 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3629.16 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4179.76 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3654.94 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3651.85 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4202.51 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3691.03 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3694.8 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4223.95 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3725.54 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3719.53 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4179.5 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3709.53 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3707.04 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4206.47 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3746.47 
5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3742.52 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4234.38 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,2780.42 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3783.83 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4203.26 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3775.48 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3773.26 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4234.75 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3816.08 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3812.14 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4209.13 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3808.44 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3807.77 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4243.99 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3850.45 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3851.7 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4226.47 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3847.7 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3844.71 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4212.0 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3849.83 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,3849.57 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,clblas_sgemmNT_w9100_14502driver,4249.86 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/dgemm_32.csv000066400000000000000000000444361264277366700224160ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,3.53294 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,23.281 
96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,68.5045 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,138.38 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,234.593 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,352.309 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,432.702 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,574.267 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,659.067 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,697.117 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,735.236 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,671.606 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,785.888 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,793.605 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,811.297 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1043.48 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1171.21 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1002.95 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,982.968 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1088.05 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,963.961 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1319.22 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1185.36 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1242.09 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1385.64 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1495.15 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1568.8 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1406.22 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1506.07 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1316.41 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1467.01 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,540.962 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1584.14 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,579.422 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1619.84 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1490.92 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,630.033 
1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,718.691 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1742.64 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1530.05 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,761.23 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1643.83 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,777.14 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,871.324 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1604.44 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,823.931 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,918.899 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1757.94 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,932.339 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1619.46 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1768.83 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,970.777 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1075.29 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1748.72 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1645.08 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1105.57 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1742.95 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1062.56 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1157.13 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1732.54 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1107.65 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1198.47 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1762.15 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1088.46 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1675.2 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1906.43 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1251.57 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1272.21 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1793.87 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1725.09 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1324.81 
2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1789.88 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1281.5 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1366.99 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1834.29 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1319.44 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1409.1 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1859.41 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1364.64 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,364.688 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1886.36 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1407.96 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1436.54 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1908.73 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1716.77 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1477.84 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1861.51 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1421.3 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1520.31 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1893.66 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1482.96 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1508.46 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1927.05 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1527.51 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1734.89 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1042.86 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1523.43 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1552.01 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1925.31 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1753.33 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1590.26 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1899.25 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1565.16 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1567.99 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1935.88 
3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1605.91 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1630.5 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1910.09 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1607.84 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1756.93 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1903.1 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,761.282 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1671.66 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1939.9 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1748.8 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1669.46 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1928.93 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1658.34 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1681.33 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1784.76 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1665.18 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1687.22 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1917.97 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1661.86 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1764.93 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1917.83 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1683.12 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,367.047 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1915.69 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1693.53 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1712.76 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1933.97 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1725.19 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1722.64 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1922.95 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1294.01 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1733.28 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1925.72 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1724.82 
4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1737.52 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1930.11 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1737.68 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1755.23 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,610.07 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1749.19 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1766.64 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1941.76 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1754.68 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1778.24 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1949.17 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1775.02 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1173.65 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1933.12 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1766.65 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1781.87 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1934.64 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1779.78 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1794.82 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1950.67 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,543.611 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1788.87 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1939.15 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1788.23 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1795.89 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1947.25 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1802.7 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1802.04 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1016.35 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1800.7 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1812.73 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1942.56 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1803.43 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1811.49 
5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1942.93 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1812.13 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,502.244 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1937.41 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1812.22 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1824.13 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1935.53 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/dgemm_96.csv000066400000000000000000000142611264277366700224210ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,68.3722 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,354.426 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,661.531 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,671.407 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,809.931 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1002.17 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,964.788 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1243.32 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1569.26 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1314.69 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1583.11 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1491.83 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1743.47 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1644.05 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1603.53 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1761.43 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1768.79 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1748.98 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1745.31 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1731.84 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1761.08 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1904.14 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1792.97 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1788.94 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1835.3 
2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1859.23 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1886.85 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1907.93 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1862.47 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1892.57 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1928.16 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1036.59 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1925.25 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1900.1 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1935.78 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1910.43 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1903.26 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1939.59 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1929.17 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1791.19 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1918.13 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1917.51 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1915.76 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1934.22 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1922.69 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1925.57 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1930.22 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,610.825 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1941.67 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1949.36 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1933.11 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1934.95 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1950.66 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1938.98 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1946.98 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1009.04 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1943.35 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1942.79 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1937.59 
5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,w9100,1938.05 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/dtrsm_w9100_14502.csv000066400000000000000000000066001264277366700235340ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,9.2894 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,54.8031 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,139.601 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,255.809 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,408.175 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,527.893 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,664.403 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,464.17 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,838.67 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,915.902 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1037.9 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,994.425 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1080.46 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1132.82 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1167.03 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,974.311 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1213.33 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1238.22 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1247.78 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1206.61 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1280.78 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1286.87 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1285.17 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1212.33 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1315.21 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1322.38 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1340.63 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1313.7 
5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1355.15 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,w9100_dtrsm_14502,1363.73 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/peak_dp.csv000066400000000000000000000437061264277366700224230ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 
1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 
2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 
3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 
4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 
5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,2619 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/peak_sp.csv000066400000000000000000000437061264277366700224420ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 
672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 
1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 
2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 
3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 
5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,W9100 Peak,5238 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/zgemm_32.csv000066400000000000000000000444371264277366700224450ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,9.09906 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,36.574 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,160.206 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,170.275 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,444.432 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,394.669 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,623.893 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,678.45 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,711.398 
320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,846.746 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,742.543 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1214.5 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,793.883 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1234.17 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,776.65 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1628.34 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,794.229 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1575.83 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,806.479 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1619.74 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,818.052 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1681.82 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,825.816 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1709.53 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,830.103 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1797.21 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,837.551 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1859.35 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,837.728 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1779.02 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,852.477 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1845.33 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,850.55 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1818.33 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,854.314 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1899.54 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,847.583 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1885.12 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,851.694 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1862.59 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,850.521 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1877.91 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,852.384 
1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1956.99 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,853.803 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1900.96 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,854.0 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1907.1 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,857.023 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1941.45 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,855.335 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1959.34 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,859.688 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1938.07 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,859.107 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1957.57 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,860.442 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1953.24 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,858.02 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1981.42 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,859.935 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1980.84 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,858.648 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1961.25 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,858.983 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1978.93 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,859.372 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1975.99 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,859.38 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1990.04 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,860.902 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.39 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,860.384 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1980.03 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.728 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1987.55 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.298 
2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1982.25 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.935 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.65 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.244 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1992.56 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.833 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1982.82 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.546 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.26 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.273 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1998.23 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,861.389 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1991.54 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.057 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.85 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.213 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.02 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.014 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1984.65 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.498 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1997.44 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.264 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.61 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.602 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.81 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.338 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.64 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.545 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.05 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.716 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2001.82 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.301 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2007.62 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.389 
3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1996.04 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.869 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2011.04 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.737 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2008.65 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.729 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2006.6 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.901 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1864.43 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.754 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2007.03 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.955 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2007.52 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.901 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.49 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.923 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,206.078 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.149 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.11 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.88 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.84 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.917 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.17 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.218 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1384.22 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,848.617 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2012.66 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.123 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.68 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.128 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2011.71 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.046 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1278.38 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.115 
4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2012.76 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.218 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2011.28 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.102 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.26 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.363 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1280.49 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.129 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2015.45 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.145 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.67 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.391 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.91 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.158 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,625.809 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.298 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2014.79 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.045 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.58 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.16 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2015.19 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.075 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1187.8 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,862.913 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.42 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.172 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.88 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.346 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.31 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.156 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1101.25 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.14 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.67 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,863.228 
5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.23 clblas-2.10/doc/performance/clBLAS_2.6.0/W9100/zgemm_64.csv000066400000000000000000000223121264277366700224360ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,36.6379 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,170.206 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,393.872 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,680.963 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,839.909 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1215.58 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1264.9 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1629.5 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1575.07 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1619.45 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1681.05 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1709.63 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1796.53 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1859.68 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1778.43 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1846.89 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1818.09 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1898.85 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1885.16 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1862.89 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1877.38 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1958.02 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1900.78 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1906.82 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1941.51 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1959.27 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1938.19 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1957.24 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1953.55 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1981.44 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1981.13 
2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1961.45 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1979.17 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1975.94 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1989.75 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.35 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1979.84 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1987.51 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1981.99 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.55 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1992.41 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1982.58 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1987.95 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1998.21 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1991.09 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1988.81 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1999.75 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1992.7 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1997.17 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.57 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2000.53 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1999.6 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2008.82 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2001.74 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2007.43 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1995.4 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.74 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2008.59 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2006.33 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1860.79 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2006.66 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2007.46 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.19 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,203.949 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2008.73 
4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.87 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.73 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1372.58 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2012.38 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.7 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2011.35 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1285.38 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2012.41 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2011.7 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2015.97 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1274.72 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2015.15 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.77 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.62 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,626.213 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2014.47 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.52 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2014.83 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1191.5 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.1 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2009.98 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2015.97 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,1098.96 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2016.34 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,w9100,2010.3 clblas-2.10/doc/performance/clBLAS_2.7.1/000077500000000000000000000000001264277366700175535ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.7.1/S9150/000077500000000000000000000000001264277366700202745ustar00rootroot00000000000000clblas-2.10/doc/performance/clBLAS_2.7.1/S9150/cgemmNT_S9150_14.50.2_2.7.1_8.csv000066400000000000000000002374561264277366700250300ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,0.361199 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2.2276 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2.32776 
32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,16.0137 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,10.1607 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,49.2067 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,23.9463 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,88.2268 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,41.8205 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,149.0 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,81.8954 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,262.241 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,113.023 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,358.057 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,149.481 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,437.362 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,202.941 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,642.493 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,266.475 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,675.49 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,354.581 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,947.929 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,427.227 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,965.276 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,470.865 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1151.68 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,564.261 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1169.86 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,660.697 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1100.42 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,672.012 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1275.83 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,752.584 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1257.92 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,792.885 
288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1637.14 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,862.501 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1310.91 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,868.528 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1763.14 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,939.498 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1426.93 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,994.839 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2184.67 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1061.54 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1455.81 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1064.5 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2197.57 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1113.97 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1541.7 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1157.94 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2593.58 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1218.16 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2275.68 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1649.97 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2598.43 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1663.01 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1632.04 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1784.65 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2401.63 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1655.42 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1720.4 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1742.48 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2630.69 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1848.68 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2676.57 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1966.52 
544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2898.03 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1996.34 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1772.8 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2131.22 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2964.79 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2111.07 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1757.07 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2227.84 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3014.12 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2201.04 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3130.76 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2315.9 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3045.8 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2338.89 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1800.66 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2439.24 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3054.27 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2367.09 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,1809.57 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2476.7 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3376.0 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2519.86 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2964.37 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2620.45 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3225.01 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2544.21 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2581.26 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2650.18 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3084.65 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2628.89 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2678.1 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2733.58 
800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3370.11 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2721.69 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3362.14 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2822.73 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3626.77 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2802.4 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2855.3 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2906.21 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3364.23 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2875.21 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2973.62 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2986.76 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3470.13 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2949.61 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3372.22 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3049.12 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3498.8 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2937.82 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,2991.87 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3040.92 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3420.03 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3018.84 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3081.84 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3117.29 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3666.15 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3098.29 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3438.92 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3193.93 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3883.16 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3099.96 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3333.83 
1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3191.77 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3797.2 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3159.09 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3213.32 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3264.48 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3842.68 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3180.58 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3557.59 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3302.29 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3731.33 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3246.25 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3282.78 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3348.68 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3824.43 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3264.17 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3322.03 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3361.29 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3676.42 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3265.04 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3698.68 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3348.0 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3833.48 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3340.9 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3370.34 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3427.27 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3769.5 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3354.76 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3394.77 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3428.27 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3854.46 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3356.23 
1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3842.82 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3455.47 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3738.39 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3379.58 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3434.34 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3471.3 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3892.39 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3406.18 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3390.86 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3478.34 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3742.88 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3429.71 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3803.2 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3494.35 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4275.9 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3467.88 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3515.66 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3560.81 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3885.56 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3446.02 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3495.0 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3522.73 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3999.45 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3479.55 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3967.79 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3561.68 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3796.64 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3498.68 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3342.08 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3578.73 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4060.0 
1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3524.12 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3566.02 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3595.36 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3835.6 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3548.26 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3978.68 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3620.63 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4136.84 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3570.85 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3607.68 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3647.09 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3922.49 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3554.89 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3594.82 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3624.58 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4209.68 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3584.03 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4011.22 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3657.67 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3873.08 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3628.26 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3647.35 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3676.68 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4287.66 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3614.69 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3640.17 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3687.91 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3874.12 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3637.53 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3945.51 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3699.49 
1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4364.49 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3638.23 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3665.15 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3697.55 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4035.19 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3672.7 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3449.74 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3721.62 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4236.62 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3665.9 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4016.01 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3718.1 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3860.81 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3658.29 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3694.95 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3716.06 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4325.98 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3681.02 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3715.82 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3747.33 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3887.78 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3688.67 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3996.85 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3756.42 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4419.58 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3690.6 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3556.11 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3759.25 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3995.97 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3692.59 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3722.97 
2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3738.7 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4322.25 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3690.07 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3989.35 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3750.43 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3877.36 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3700.9 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3729.14 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3763.96 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4427.9 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3742.16 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3764.93 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3797.07 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3884.58 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3724.5 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4012.59 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3768.88 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4362.42 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3724.85 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3768.7 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3779.79 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4027.22 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3735.91 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4152.28 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4165.81 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4464.33 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4059.78 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4041.6 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4079.58 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3912.26 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4135.45 
2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4151.52 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4209.77 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4413.94 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4045.23 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4072.2 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4104.25 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3903.07 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4148.19 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4076.74 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4199.66 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4381.57 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4039.41 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4077.39 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4107.44 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4095.99 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4143.21 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4173.09 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4204.57 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4483.84 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4103.93 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4058.58 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4203.27 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3927.66 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4247.1 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4221.69 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4280.11 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4460.52 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4169.81 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4139.63 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4163.01 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,3931.64 
2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4239.33 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4111.08 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4253.93 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4439.55 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4152.0 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4170.51 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4176.27 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4160.97 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4229.72 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4246.13 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4275.74 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4435.39 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4138.54 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4109.73 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4182.95 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4207.03 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4237.41 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4260.02 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4307.7 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4431.53 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4149.44 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4168.89 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4204.26 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4235.88 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4259.82 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4113.69 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4306.36 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4433.54 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4184.93 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4194.42 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4220.0 
2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4095.4 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4266.96 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4291.18 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4319.9 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4534.16 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4202.48 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4091.81 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4258.21 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4302.23 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4308.92 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4311.36 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4345.07 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4450.24 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4166.38 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4191.16 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4210.86 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4233.28 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4253.67 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4118.38 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4302.21 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4463.35 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4187.53 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4204.78 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4229.74 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4263.11 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4270.54 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4285.81 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4319.45 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4480.55 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4194.43 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4108.66 
3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4243.52 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4268.24 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4289.28 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4305.39 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4347.44 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4494.89 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4220.15 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4239.43 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4267.48 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4282.14 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4307.53 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4105.7 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4364.01 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4520.91 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4258.13 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4299.19 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4283.66 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4259.08 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4341.06 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4340.29 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4368.61 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4543.9 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4269.73 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4111.35 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4301.24 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4321.68 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4348.82 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4374.19 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4386.66 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4495.83 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4250.2 
3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4277.2 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4301.24 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4306.49 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4339.31 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4123.86 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4382.35 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4522.86 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4278.97 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4300.56 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4329.68 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4282.54 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4367.67 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4379.7 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4416.05 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4555.1 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4289.73 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4136.73 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4346.59 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4358.73 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4379.36 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4389.88 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4426.37 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4519.75 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4298.9 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4330.66 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4331.55 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4369.68 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4365.76 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4127.24 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4420.5 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4554.76 
3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4352.75 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4334.51 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4343.86 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4194.7 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4386.63 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4403.43 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4437.95 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4524.23 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4314.77 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4147.1 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4353.05 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4365.07 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4384.52 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4408.61 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.58 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4565.19 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4344.62 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4354.33 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4383.69 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4387.47 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4410.71 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4148.56 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4445.91 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4544.8 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4336.82 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4354.45 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4381.61 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4265.95 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4410.73 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.23 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4439.63 
3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4529.11 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4340.59 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4152.15 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4368.89 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4391.54 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4403.58 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4428.0 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4446.62 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4568.27 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4358.12 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4383.98 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4394.45 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4417.01 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4427.74 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4447.88 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4464.3 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4558.73 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4363.91 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4374.92 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4388.66 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4423.81 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4426.44 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4458.09 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4467.83 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4550.2 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4361.26 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4380.44 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4398.02 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4417.93 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4429.1 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4442.4 
4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4463.09 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4545.59 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4372.77 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4387.61 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4399.37 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4421.1 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.73 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4452.67 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4467.68 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4536.02 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4372.23 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4388.69 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4407.33 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4431.87 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4448.59 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4461.94 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4483.25 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4542.46 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4375.42 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4405.9 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4408.97 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4429.56 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4446.96 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4458.89 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4478.17 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4587.85 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4390.32 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4406.37 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4421.81 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4442.44 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4457.53 
4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4480.26 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4489.15 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4547.5 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4385.74 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4404.07 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4422.47 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4431.8 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4455.27 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4473.62 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4483.56 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4551.1 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4396.91 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4414.63 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4429.0 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4446.38 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4463.01 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4481.41 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4494.62 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4559.23 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4409.05 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4422.38 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.55 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4447.46 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4466.16 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4487.6 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4495.44 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4567.24 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4407.3 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4421.89 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4444.47 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4453.27 
4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4478.61 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4488.37 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4501.87 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4577.04 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4415.16 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.25 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4450.49 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4460.13 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4476.51 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4492.17 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4524.63 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4584.96 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4419.48 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4444.45 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4459.13 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4468.23 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4481.39 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4496.38 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4521.38 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4563.5 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4423.78 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4441.45 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4455.81 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4469.31 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4484.76 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4500.86 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4513.65 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4576.93 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4432.52 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4447.64 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4463.98 
4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4481.59 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4487.63 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4507.17 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4517.66 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4592.0 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4441.1 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4448.64 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4474.45 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4489.8 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4496.47 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4512.44 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4539.6 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4572.54 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4431.87 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4447.87 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4464.26 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4475.5 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4492.01 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4506.54 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4525.94 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4590.92 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4440.29 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4461.77 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4471.37 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4492.36 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4511.65 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4519.0 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4526.24 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4576.33 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4442.67 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4455.78 
5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4467.44 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4486.25 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4497.23 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4513.36 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4529.15 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4595.7 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4451.81 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4466.45 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4478.95 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4502.74 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4513.4 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4517.44 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4541.34 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4581.48 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4450.48 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4466.61 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4478.64 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4492.96 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4505.04 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4521.99 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4539.53 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4575.78 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4458.11 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4473.02 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4483.83 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4500.29 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4510.62 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4527.95 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4537.74 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4597.23 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4460.92 
5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4477.57 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4488.2 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4501.27 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4513.08 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4529.49 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4548.25 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4591.1 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4464.77 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4475.4 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4488.52 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4504.75 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4522.29 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4530.57 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4543.75 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4585.14 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4470.06 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4480.36 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4490.36 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4502.95 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4519.23 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4532.33 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4542.75 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4583.08 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4465.1 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4479.07 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4490.49 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4507.42 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4518.44 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4533.35 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4542.64 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4581.07 
5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4469.92 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4481.11 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4494.5 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4507.02 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4520.43 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4534.3 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4545.12 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4580.74 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4470.51 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4484.71 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4495.75 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4513.12 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4524.8 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4535.79 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4546.03 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4600.44 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4478.73 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4493.21 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4499.7 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4521.22 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4523.6 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4548.72 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4553.25 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4582.92 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4459.77 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4475.92 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4487.92 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4498.19 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4517.69 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4524.72 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4541.09 
5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,4,S9150_14.50.2,4585.3
clblas-2.10/doc/performance/clBLAS_2.7.1/S9150/dgemmNT_S9150_14.50.2_2.7.1_8.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS
8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,0.0782277 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,0.5681 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,0.516013 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,3.88477 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,2.06385 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,11.6967 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,5.64319 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,23.4581 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,9.6459 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,43.7794 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,18.2334 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,61.978 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,26.7411 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,86.724 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,40.5681 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,110.464 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,48.896 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,143.007 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,66.4737 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,177.508 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,92.619 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,216.04 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,114.66 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,272.488 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,122.521 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,297.485 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,139.513 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,296.008
232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,162.435 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,276.729 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,177.464 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,379.876 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,196.306 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,315.195 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,203.466 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,473.778 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,220.821 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,329.592 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,234.927 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,544.998 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,237.02 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,353.293 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,253.086 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,685.812 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,263.4 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,367.008 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,265.887 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,534.861 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,389.967 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,413.624 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,422.542 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,805.407 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,427.834 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,528.043 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,449.757 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,622.532 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,402.525 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,408.52 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,432.377 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,663.758 
488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,460.209 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,430.199 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,475.625 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,822.135 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,509.634 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,796.375 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,544.868 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,849.077 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,563.803 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,686.442 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,600.319 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1046.71 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,586.245 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,601.428 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,618.318 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,755.215 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,567.68 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1129.73 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,594.142 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1059.47 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,616.561 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,630.705 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,645.133 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,921.096 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,654.596 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,673.583 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,687.732 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1289.17 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,683.322 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1066.22 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,722.16 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,829.15 
744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,664.736 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,680.6 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,694.681 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1278.81 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,713.576 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,728.586 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,748.047 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,890.241 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,737.865 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1355.38 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,764.625 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1501.81 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,757.808 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,770.412 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,785.928 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1519.03 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,752.633 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1078.2 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,779.672 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1375.67 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,786.676 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1152.8 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,810.897 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,950.979 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,793.2 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,808.555 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,820.103 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1389.73 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,797.267 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,809.817 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,821.175 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,976.231 
1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,826.375 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1406.36 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,839.216 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1578.51 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,833.802 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1278.99 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,863.496 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1540.55 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,831.673 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,853.314 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,853.339 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1582.68 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,855.113 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1337.01 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,872.876 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1163.87 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,862.628 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1182.83 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1207.89 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1445.51 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1153.03 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1175.4 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1176.53 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,981.487 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1227.48 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1573.43 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1248.7 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1578.47 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1199.22 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1209.44 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1230.22 
1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1699.59 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1251.0 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1273.02 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1287.57 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1590.53 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1244.0 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1485.26 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1274.51 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,993.004 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1301.79 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1314.03 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1326.27 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1601.15 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1276.58 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1278.46 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1299.77 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,992.864 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1328.08 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1714.85 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1356.69 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1772.2 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1345.4 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1353.81 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1370.64 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1559.24 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1405.67 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1419.07 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1413.53 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1662.62 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1356.02 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1662.51 
1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1386.62 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1004.78 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1417.87 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1264.72 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1443.8 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1710.95 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1401.42 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1419.84 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1425.75 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1444.9 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1457.56 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1630.1 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1472.9 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1727.37 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1432.28 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1455.28 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1465.8 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1726.17 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1477.81 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1502.69 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1510.27 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1759.82 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1469.09 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1820.71 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1494.65 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1505.76 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1526.07 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1533.92 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1554.46 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1706.04 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1497.58 
1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1520.38 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1535.59 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1410.64 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1549.15 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1795.3 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1590.0 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1826.15 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1554.98 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1520.45 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1546.2 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1701.26 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1565.86 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1314.82 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1584.09 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1778.42 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1526.05 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1784.29 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1557.5 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1581.99 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1575.6 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1596.08 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1604.35 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1688.12 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1558.41 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1574.73 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1577.48 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1598.76 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1618.8 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1782.94 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1652.05 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.55 
1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1590.48 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1357.68 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1606.04 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1717.41 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1631.74 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1639.12 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1646.23 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1808.61 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1598.26 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1782.64 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1626.44 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1375.23 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1645.57 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1667.1 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1682.16 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1858.4 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1632.53 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1644.0 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1655.66 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1690.0 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1683.11 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.42 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1707.75 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1836.16 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1634.52 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1655.56 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1658.9 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1749.34 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1679.08 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1703.58 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1699.52 
2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1879.27 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1642.14 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1813.71 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1678.36 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1698.37 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1713.16 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1715.03 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1723.88 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1743.05 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1664.36 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1381.55 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1683.17 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1707.21 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1706.76 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1832.1 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1716.37 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1847.84 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1669.33 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1673.69 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1685.5 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1790.95 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1720.19 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1718.34 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1736.55 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.07 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1702.25 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1854.32 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1710.38 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1736.55 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1732.99 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1436.84 
2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1759.82 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1814.88 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1721.7 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1718.25 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1733.34 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1732.51 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1741.26 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.67 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1745.95 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1867.74 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1713.11 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1726.88 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1732.91 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1841.7 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1748.15 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1761.62 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1774.03 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.32 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1718.68 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1809.86 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1744.32 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1756.26 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1766.71 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1774.71 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1777.9 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1863.24 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1720.91 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1733.77 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1740.23 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1412.32 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1767.66 
2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1832.54 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1781.34 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.56 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1732.24 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1747.72 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1759.76 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1816.62 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1768.1 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.07 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1790.26 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1915.33 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1752.44 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1866.96 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1766.86 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1788.84 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1784.01 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1801.38 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.08 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1848.1 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1739.84 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1754.53 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1763.81 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1773.12 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1779.79 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1826.35 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1802.52 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1887.66 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1752.03 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1764.91 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1770.76 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.04 
2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1785.73 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.06 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1806.23 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1895.41 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1763.57 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1861.28 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1783.78 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1794.23 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1800.23 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1805.91 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1815.35 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1145.82 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1764.72 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1786.22 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1788.54 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1796.04 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1806.82 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1830.3 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1830.75 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1912.97 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1786.21 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1786.1 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1805.01 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1879.37 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1820.17 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1824.26 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1833.5 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1922.49 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1781.27 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1864.25 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1804.95 
3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1813.63 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1823.89 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1838.35 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1839.8 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1853.65 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1784.92 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1797.01 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1804.77 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1810.73 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1824.78 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1839.13 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1842.3 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1914.21 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1790.83 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1810.9 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1809.77 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.02 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1828.63 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1837.25 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1853.34 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1928.83 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.49 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.35 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1821.3 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1832.2 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1838.28 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1851.45 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1854.44 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1863.47 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1796.43 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1811.93 
3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1818.32 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1835.75 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1836.4 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1861.08 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1852.45 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1929.45 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1813.9 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1822.88 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1828.24 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1856.14 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1866.6 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1843.59 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1854.67 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1887.81 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1810.44 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1887.43 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1833.77 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1842.11 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1841.88 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1853.79 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1865.82 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1892.59 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1822.57 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1833.03 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1833.44 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1847.14 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1849.47 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.54 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1871.59 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1925.93 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1825.68 
3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1827.33 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1838.0 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1883.21 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.21 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1857.87 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1868.98 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1920.23 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1824.77 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.17 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1842.45 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1844.57 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.39 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1866.18 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.1 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1825.16 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1831.01 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1837.94 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1847.99 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1862.48 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1862.36 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.49 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.83 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1932.93 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1839.2 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1851.38 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1852.17 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1871.51 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1857.47 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.73 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.69 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1928.95 
3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1831.5 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1868.62 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1848.86 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1866.37 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1874.72 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1871.11 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.14 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.57 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1845.67 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1851.38 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1857.85 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1860.38 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1874.15 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1858.95 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1863.02 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,532.301 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1817.29 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1843.93 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1857.48 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1867.54 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1869.15 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.2 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1888.28 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1927.0 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1844.25 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1867.52 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.35 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1869.01 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.02 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1878.2 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.04 
4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.27 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1845.18 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.33 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1859.1 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1872.64 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1872.8 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1868.57 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.51 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1929.65 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1845.66 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1854.42 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1858.38 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1874.68 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.98 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1882.43 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.25 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1929.96 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1847.39 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1869.65 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1863.1 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1872.66 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.6 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.25 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1888.88 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.44 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1851.18 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1860.47 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1863.96 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.31 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1878.54 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.46 
4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.67 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1938.13 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1855.52 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1865.85 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1869.31 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.69 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.67 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.02 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1897.5 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1942.65 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1857.5 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1883.03 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.6 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.62 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.3 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.48 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.99 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,627.033 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1851.91 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1864.71 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1878.46 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1882.3 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.65 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1888.97 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1899.69 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1937.44 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1862.79 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1867.64 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.06 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1893.7 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.1 
4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.58 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1901.42 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1942.71 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1862.99 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.74 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1878.23 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.07 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.49 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1896.94 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1901.95 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1898.94 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1867.9 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1879.67 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.77 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1888.51 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.08 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1896.75 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1900.83 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1939.52 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1864.58 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.1 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1879.62 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1884.23 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.4 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1899.06 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1902.86 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1949.16 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.09 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1883.57 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1883.86 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.14 
4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1900.2 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1906.92 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1908.64 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1884.35 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1873.82 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1882.9 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.16 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1892.09 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.53 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.85 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1906.85 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1951.43 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1877.9 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.03 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.79 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1899.5 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1897.1 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1907.66 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1901.9 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1167.52 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1870.32 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1875.69 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.25 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.82 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1903.03 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1909.26 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1912.68 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.98 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1879.96 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.96 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.54 
5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1897.93 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1901.07 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1893.22 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1912.17 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1951.88 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1880.3 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1885.35 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.84 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1897.85 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1897.57 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1907.51 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1914.74 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1949.99 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1882.96 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1881.43 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1890.87 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1899.55 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1900.56 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1908.78 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1915.03 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1052.81 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1876.17 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.13 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.59 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1789.14 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1800.09 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.55 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1773.1 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1946.86 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1780.36 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1775.18 
5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1762.86 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1894.92 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1761.19 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1779.9 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1764.95 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1945.64 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1779.96 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1886.92 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1781.68 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1779.2 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1772.69 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1783.51 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1788.97 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1893.65 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1794.52 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1785.93 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1787.07 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1798.78 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1791.5 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1889.68 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1726.9 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1081.06 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1610.88 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1775.56 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1782.07 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1887.6 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1774.36 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1787.88 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1779.13 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1947.02 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1789.26 
5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1891.45 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1772.2 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1787.8 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1789.13 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1803.68 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1782.79 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,4,S9150_14.50.2,1882.52 clblas-2.10/doc/performance/clBLAS_2.7.1/S9150/sgemmNT_S9150_14.50.2_2.7.1_8.csv000066400000000000000000002374711264277366700250450ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,0.0752941 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,0.630639 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,0.645378 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4.15838 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2.5869 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,9.41609 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,5.78636 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,32.2242 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,13.1264 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,31.0585 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,20.7229 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,81.0198 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,28.3842 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,71.2258 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,45.3246 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,154.259 136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,61.4951 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,133.991 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,77.7551 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,258.016 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,102.179 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,235.854 
184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,122.737 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,361.948 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,146.493 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,324.929 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,179.622 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,531.541 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,211.182 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,409.904 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,222.736 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,546.845 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,271.003 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,539.002 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,288.918 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,579.311 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,297.959 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,673.324 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,351.093 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,888.022 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,413.179 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,821.595 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,426.794 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1086.96 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,473.209 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,965.066 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,485.766 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1089.43 392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,547.777 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1099.66 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,576.01 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1334.16 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,641.76 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1165.64 
440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,667.691 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1243.64 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,696.787 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,883.578 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,523.493 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1409.71 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,568.646 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,989.932 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,600.319 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1408.08 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,635.143 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1110.55 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,685.851 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1440.62 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,706.486 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1251.76 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,746.362 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1673.11 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,768.636 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1361.16 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,790.398 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1500.57 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,844.988 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1381.5 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,882.381 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1829.08 648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,929.407 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1228.09 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,978.983 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1809.19 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,981.986 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1296.32 
696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1021.62 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2244.61 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1063.12 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1427.83 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1144.69 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1606.29 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1109.32 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1520.38 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1157.19 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2164.54 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1194.91 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1490.42 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1230.61 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2098.02 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1230.57 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1410.19 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1277.33 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2579.52 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1275.59 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1507.54 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1319.9 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2542.21 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1312.84 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1595.26 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1355.13 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2551.15 904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1346.09 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1554.92 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1395.0 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1737.86 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1375.87 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1502.76 
952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1420.9 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2496.19 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1407.31 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1589.31 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1448.41 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1830.78 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1434.17 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1593.39 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1308.82 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2743.3 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1356.73 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1616.43 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1364.57 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2887.83 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1502.1 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1588.62 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1497.45 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2872.85 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1496.71 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2332.84 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1526.47 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2704.49 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1563.19 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1681.86 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1609.44 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2765.7 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1556.16 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1801.52 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1598.34 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1848.52 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1569.46 
1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2663.36 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1567.39 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2906.0 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1672.94 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1897.52 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1632.37 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3259.11 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1736.72 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1887.1 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1761.23 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2933.74 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1808.47 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2533.56 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1800.1 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2024.49 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1829.19 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1985.04 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1840.55 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3120.44 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1899.49 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2986.98 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1916.14 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2103.99 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1890.78 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2515.02 1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1935.62 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3282.93 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1943.06 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2130.9 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2016.16 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3076.11 
1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1971.27 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2135.6 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1991.87 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3084.78 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1951.77 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2628.53 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,1974.23 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2269.66 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2063.77 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2988.14 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2093.01 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3494.58 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2218.6 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2333.13 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2241.41 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2355.68 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2249.38 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2640.63 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2314.01 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3203.58 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2338.3 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2343.12 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2380.17 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3460.26 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2263.13 1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2463.42 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2314.59 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3265.3 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2329.25 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3037.68 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2377.59 
1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2526.29 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2425.76 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2594.83 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2447.62 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3441.49 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2364.98 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2579.22 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2416.51 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2609.42 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2405.62 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2626.62 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2442.61 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3396.97 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2483.41 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2696.71 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2535.6 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3452.7 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2453.22 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3124.15 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2495.7 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3306.18 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2485.57 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2722.47 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2528.34 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2675.7 1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2559.38 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2812.67 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2600.55 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3475.99 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2529.35 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2753.53 
1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2573.6 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2759.17 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2576.04 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2718.38 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2606.14 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3452.1 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2645.53 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3235.8 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2685.98 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3506.98 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2584.53 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2804.81 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2621.78 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3358.59 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2624.67 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2809.45 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2665.51 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2836.33 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2704.67 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2850.75 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2741.95 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3856.81 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2698.17 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2967.15 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2739.78 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2920.42 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2719.03 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3351.97 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2758.83 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3423.0 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2798.34 
2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3085.43 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2822.42 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3606.4 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2711.49 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2999.36 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2753.45 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3503.85 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2736.74 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3033.28 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2788.21 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3007.87 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2817.92 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3106.69 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2861.65 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3661.85 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2795.29 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3317.26 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2829.84 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3001.59 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2856.51 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3125.19 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2872.07 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3451.31 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2897.33 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3151.45 2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2930.13 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3727.48 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2846.6 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3115.35 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2876.27 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3530.07 
2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2876.05 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3141.16 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2934.48 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3097.69 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2978.61 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3226.76 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3001.28 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3793.04 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2969.32 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3147.45 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2966.32 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3106.32 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3032.09 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3219.33 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3060.47 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3495.84 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2960.43 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3165.34 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3005.2 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3861.44 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3034.4 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3243.15 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3083.76 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3504.08 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2974.15 2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3184.42 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3009.67 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3137.93 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3044.31 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3253.09 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3085.1 
2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3933.31 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,2987.82 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3181.2 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3012.33 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3156.65 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3036.03 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3256.77 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3103.3 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3504.49 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3019.4 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3213.21 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3040.81 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3816.67 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3053.76 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3270.0 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3084.65 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3584.77 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3058.12 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3232.05 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3102.75 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3252.69 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3121.9 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3296.07 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3167.84 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3896.52 2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3033.21 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3241.4 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3072.87 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3202.88 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3083.02 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3300.61 
2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3136.25 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3532.36 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3065.08 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3269.98 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3078.02 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3977.35 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3106.74 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3316.3 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3157.35 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3547.3 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3065.97 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3285.29 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3111.06 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3252.75 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3122.03 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3347.38 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3147.01 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3885.04 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3088.55 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3309.49 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3126.87 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3276.83 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3170.01 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3358.68 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3184.36 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3581.12 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3121.12 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3315.84 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3135.86 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3985.87 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3160.72 
3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3389.52 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3210.68 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3599.86 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3131.57 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3329.72 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3164.37 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3335.55 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3195.69 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3407.62 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3233.03 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3928.12 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3117.09 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3322.7 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3153.57 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3314.08 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3171.69 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3392.57 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3196.64 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3585.94 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3133.99 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3345.42 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3176.97 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4017.75 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3190.75 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3415.31 3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3228.74 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3611.86 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3173.37 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3369.86 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3203.24 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3374.78 
3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3216.74 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3431.0 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3251.97 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3975.91 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3157.28 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3361.93 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3186.05 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3361.2 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3216.33 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3420.55 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3241.68 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3613.33 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3175.87 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3387.88 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3204.09 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3946.48 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3245.05 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3442.57 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3259.64 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3584.05 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3170.86 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3384.23 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3203.95 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3383.7 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3235.75 3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3439.59 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3245.7 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4037.09 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3200.42 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3406.23 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3220.4 
3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3418.96 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3262.75 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3475.12 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3273.2 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3607.25 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3206.24 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3409.67 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3226.62 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4016.15 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3259.84 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3462.8 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3285.03 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3595.5 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3200.62 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3406.59 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3218.35 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3410.29 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3256.63 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3470.08 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3285.86 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4001.55 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3215.9 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3431.1 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3245.84 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3446.27 3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3284.85 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3489.41 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3303.92 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3620.38 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3222.48 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3431.0 
3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3247.22 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3995.38 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3275.51 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3484.03 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3294.29 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3614.04 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3232.88 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3442.08 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3246.11 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3785.29 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3463.78 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3787.68 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3470.58 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3992.31 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3347.38 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3702.85 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3380.15 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3749.35 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3386.11 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3742.55 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3402.5 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3348.79 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3438.54 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3798.86 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3436.66 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3993.06 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3350.32 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3730.84 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3379.55 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3754.6 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3409.53 
4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3754.21 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3449.67 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3836.33 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3460.85 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3829.89 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3488.7 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4087.28 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3438.94 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3753.58 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3464.05 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3793.45 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3459.71 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3778.02 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3499.33 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3816.95 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3509.46 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3830.8 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3521.8 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4008.85 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3382.09 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3731.3 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3400.04 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3764.17 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3427.59 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3784.66 4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3451.84 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3809.45 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3485.17 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3833.38 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3517.48 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4020.56 
4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3413.52 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3751.24 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3404.62 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3778.41 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3440.26 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3800.86 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3465.24 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3828.89 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3493.09 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3854.17 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3516.6 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4035.35 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3436.38 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3770.94 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3430.96 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3797.67 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3460.18 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3811.85 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3480.66 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3836.16 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3509.6 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3864.47 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3528.01 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4050.77 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3411.09 4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3792.9 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3451.56 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3817.44 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3488.94 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3836.69 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3510.44 
4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3868.05 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3526.9 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3888.59 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3550.29 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4071.57 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3441.25 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3811.89 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3471.5 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3832.37 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3504.09 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3848.87 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3518.82 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3871.71 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3535.84 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3896.53 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3554.14 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4092.6 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3469.53 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3827.05 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3485.15 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3849.27 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3529.96 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3873.8 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3552.01 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3895.96 4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3575.7 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3914.82 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3580.45 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4048.93 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3444.65 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3817.2 
4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3483.84 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3840.02 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3498.87 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3862.43 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3523.9 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3890.69 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3556.43 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3907.93 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3579.4 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4075.14 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3482.59 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3844.58 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3497.77 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3869.76 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3535.61 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3880.04 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3560.09 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3908.27 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3581.79 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3932.35 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3587.45 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4101.8 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3473.06 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3854.38 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3476.0 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3391.73 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3517.17 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3907.23 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3538.84 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3917.37 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3581.88 
5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3939.61 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3619.4 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4071.22 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3481.23 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3859.51 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3503.62 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3879.4 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3538.73 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3896.94 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3574.84 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3917.89 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3581.68 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3942.1 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3619.42 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4101.91 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3523.72 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3880.04 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3525.33 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3900.1 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3553.59 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3913.77 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3595.28 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3934.11 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3608.89 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3959.25 5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3626.94 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4077.09 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3519.76 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3877.3 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3527.67 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3897.33 
5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3559.26 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3921.7 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3571.95 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3940.16 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3581.72 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3955.92 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3610.31 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4110.71 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3536.23 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3902.23 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3563.75 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3921.18 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3580.09 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3925.84 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3612.93 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3952.41 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3617.32 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3972.51 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3656.52 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4093.37 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3546.37 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3897.24 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3549.19 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3920.06 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3565.77 5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3937.68 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3578.86 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3693.88 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3594.87 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3979.86 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3624.36 
5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4079.02 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3517.56 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3897.5 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3543.44 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3915.5 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3583.85 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3934.71 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3592.24 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3955.14 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3609.86 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3970.68 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,3651.09 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,4,S9150_14.50.2,4115.82
clblas-2.10/doc/performance/clBLAS_2.7.1/S9150/zgemmNT_S9150_14.50.2_2.7.1_8.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS
8,8,8,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,0.286634 16,16,16,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,2.31086 24,24,24,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,2.20435 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,14.937 40,40,40,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,8.32114 48,48,48,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,40.715 56,56,56,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,19.8324 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,84.0879 72,72,72,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,38.1498 80,80,80,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,119.417 88,88,88,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,60.8933 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,185.382 104,104,104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,92.3439 112,112,112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,256.666 120,120,120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,127.575 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,298.953
136,136,136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,155.299 144,144,144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,395.037 152,152,152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,202.395 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,414.208 168,168,168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,239.613 176,176,176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,514.987 184,184,184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,276.56 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,506.921 200,200,200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,300.117 208,208,208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,569.281 216,216,216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,340.981 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,602.125 232,232,232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,370.374 240,240,240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,493.582 248,248,248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,360.006 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,773.723 264,264,264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,400.462 272,272,272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,591.09 280,280,280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,445.635 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1016.07 296,296,296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,462.896 304,304,304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,597.183 312,312,312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,471.385 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,915.179 328,328,328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,487.81 336,336,336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,775.529 344,344,344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,587.284 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1115.74 360,360,360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,658.262 368,368,368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,631.105 376,376,376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,722.271 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1020.4 
392,392,392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,726.417 400,400,400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,648.504 408,408,408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,790.782 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1214.81 424,424,424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,807.704 432,432,432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1307.47 440,440,440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,866.428 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1142.64 456,456,456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,837.493 464,464,464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,672.178 472,472,472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,909.421 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1236.65 488,488,488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,877.644 496,496,496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,702.427 504,504,504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,930.044 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1293.74 520,520,520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,944.946 528,528,528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1506.36 536,536,536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1014.87 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1274.11 552,552,552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,990.492 560,560,560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1006.81 568,568,568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1030.75 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1334.83 584,584,584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1001.15 592,592,592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1034.88 600,600,600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1060.77 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1259.68 616,616,616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1025.67 624,624,624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1571.76 632,632,632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1082.02 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1265.58 
648,648,648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1071.5 656,656,656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1092.34 664,664,664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1125.23 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1532.14 680,680,680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1081.06 688,688,688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1103.54 696,696,696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1130.92 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1404.32 712,712,712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1125.76 720,720,720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1506.62 728,728,728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1177.32 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1308.68 744,744,744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1126.75 752,752,752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1148.53 760,760,760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1173.46 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1701.02 776,776,776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1152.55 784,784,784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1179.56 792,792,792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1203.31 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1350.22 808,808,808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1183.62 816,816,816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1671.91 824,824,824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1228.49 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1583.91 840,840,840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1199.85 848,848,848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1222.57 856,856,856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1248.61 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1665.84 872,872,872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1227.54 880,880,880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1247.77 888,888,888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1274.62 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1487.4 
904,904,904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1251.05 912,912,912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1665.2 920,920,920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1296.84 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1376.58 936,936,936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1238.44 944,944,944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1261.94 952,952,952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1282.54 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1669.81 968,968,968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1264.59 976,976,976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1283.34 984,984,984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1306.32 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1433.67 1000,1000,1000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1287.71 1008,1008,1008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1682.55 1016,1016,1016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1335.7 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1651.62 1032,1032,1032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1280.44 1040,1040,1040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1476.02 1048,1048,1048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1498.49 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1849.66 1064,1064,1064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1478.95 1072,1072,1072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1497.24 1080,1080,1080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1520.57 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1622.68 1096,1096,1096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1549.51 1104,1104,1104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1726.01 1112,1112,1112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1474.8 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1444.24 1128,1128,1128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1522.2 1136,1136,1136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1544.91 1144,1144,1144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1561.65 
1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1746.65 1160,1160,1160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1516.18 1168,1168,1168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1535.85 1176,1176,1176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1554.5 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1416.3 1192,1192,1192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1598.22 1200,1200,1200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1781.46 1208,1208,1208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1545.01 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1605.85 1224,1224,1224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1591.72 1232,1232,1232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1609.65 1240,1240,1240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1629.63 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1811.85 1256,1256,1256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1582.46 1264,1264,1264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1606.76 1272,1272,1272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1617.74 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1606.33 1288,1288,1288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1652.71 1296,1296,1296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1838.86 1304,1304,1304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1614.54 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1431.3 1320,1320,1320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1648.66 1328,1328,1328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1672.05 1336,1336,1336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1701.97 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1874.72 1352,1352,1352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1639.37 1360,1360,1360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1668.56 1368,1368,1368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1679.69 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1705.76 1384,1384,1384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1721.89 1392,1392,1392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1817.38 
1400,1400,1400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1631.44 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1770.98 1416,1416,1416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1657.63 1424,1424,1424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1675.51 1432,1432,1432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1708.86 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1853.36 1448,1448,1448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1659.34 1456,1456,1456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1679.41 1464,1464,1464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1701.39 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1661.38 1480,1480,1480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1738.48 1488,1488,1488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.76 1496,1496,1496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1697.45 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1717.86 1512,1512,1512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1731.42 1520,1520,1520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1766.98 1528,1528,1528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1763.09 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1842.11 1544,1544,1544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1685.99 1552,1552,1552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1700.89 1560,1560,1560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1714.48 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1735.86 1576,1576,1576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1747.88 1584,1584,1584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.56 1592,1592,1592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1718.02 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1710.69 1608,1608,1608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1759.0 1616,1616,1616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1770.37 1624,1624,1624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1786.74 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1865.67 1640,1640,1640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1709.16 
1648,1648,1648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1724.03 1656,1656,1656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1738.33 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1734.57 1672,1672,1672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1770.25 1680,1680,1680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1904.35 1688,1688,1688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1750.26 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1760.97 1704,1704,1704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1784.86 1712,1712,1712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1804.06 1720,1720,1720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1827.38 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1886.01 1736,1736,1736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1739.5 1744,1744,1744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1756.23 1752,1752,1752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1774.13 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1792.45 1768,1768,1768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1801.33 1776,1776,1776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1869.54 1784,1784,1784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1742.28 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1790.31 1800,1800,1800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1767.6 1808,1808,1808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1776.84 1816,1816,1816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1799.03 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.0 1832,1832,1832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1762.7 1840,1840,1840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1776.4 1848,1848,1848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1796.4 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1743.94 1864,1864,1864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1830.32 1872,1872,1872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1903.19 1880,1880,1880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1774.2 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1791.8 
1896,1896,1896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1813.75 1904,1904,1904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1826.96 1912,1912,1912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1825.84 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1889.43 1928,1928,1928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1775.95 1936,1936,1936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1787.3 1944,1944,1944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1797.73 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1820.36 1960,1960,1960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1832.06 1968,1968,1968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1890.55 1976,1976,1976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1775.32 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1812.38 1992,1992,1992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1801.24 2000,2000,2000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1816.65 2008,2008,2008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1827.92 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1889.31 2024,2024,2024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1780.23 2032,2032,2032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1795.64 2040,2040,2040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1808.72 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1765.35 2056,2056,2056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1823.96 2064,2064,2064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1885.57 2072,2072,2072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1778.13 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1797.56 2088,2088,2088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1803.88 2096,2096,2096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1824.12 2104,2104,2104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1842.51 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.8 2120,2120,2120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1802.18 2128,2128,2128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1823.47 2136,2136,2136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1828.8 
2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1842.73 2152,2152,2152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1852.13 2160,2160,2160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1895.35 2168,2168,2168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1776.88 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1781.83 2184,2184,2184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1800.75 2192,2192,2192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1807.56 2200,2200,2200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1817.28 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.33 2216,2216,2216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1778.92 2224,2224,2224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1790.0 2232,2232,2232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1802.82 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1823.98 2248,2248,2248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1831.46 2256,2256,2256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.0 2264,2264,2264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1790.22 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1810.69 2280,2280,2280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1819.72 2288,2288,2288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1825.01 2296,2296,2296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1843.32 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1906.65 2312,2312,2312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1803.76 2320,2320,2320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1806.34 2328,2328,2328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1831.52 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1840.44 2344,2344,2344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1853.87 2352,2352,2352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.99 2360,2360,2360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1811.58 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1792.31 2376,2376,2376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1838.37 2384,2384,2384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1858.6 
2392,2392,2392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1860.07 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1933.42 2408,2408,2408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1819.33 2416,2416,2416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1836.25 2424,2424,2424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1848.54 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1823.62 2440,2440,2440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1869.54 2448,2448,2448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.67 2456,2456,2456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1813.75 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1821.91 2472,2472,2472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1840.62 2480,2480,2480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1850.91 2488,2488,2488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1864.61 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.5 2504,2504,2504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1830.34 2512,2512,2512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1834.66 2520,2520,2520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1857.55 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1858.24 2536,2536,2536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1874.39 2544,2544,2544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.44 2552,2552,2552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1834.66 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1806.38 2568,2568,2568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1856.05 2576,2576,2576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1874.94 2584,2584,2584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1885.09 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.25 2600,2600,2600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1834.23 2608,2608,2608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1841.57 2616,2616,2616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1850.73 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1809.89 2632,2632,2632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1878.44 
2640,2640,2640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.74 2648,2648,2648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1846.46 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1860.67 2664,2664,2664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1861.86 2672,2672,2672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1873.34 2680,2680,2680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1894.3 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.02 2696,2696,2696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1837.1 2704,2704,2704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1851.46 2712,2712,2712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1863.85 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1885.4 2728,2728,2728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1878.47 2736,2736,2736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.13 2744,2744,2744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1853.11 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1807.01 2760,2760,2760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1878.25 2768,2768,2768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1889.55 2776,2776,2776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.28 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1932.27 2792,2792,2792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1853.29 2800,2800,2800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1861.52 2808,2808,2808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1869.94 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1831.82 2824,2824,2824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1893.92 2832,2832,2832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.26 2840,2840,2840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1845.63 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1854.77 2856,2856,2856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1867.66 2864,2864,2864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1882.59 2872,2872,2872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.89 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1942.04 
2888,2888,2888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1857.45 2896,2896,2896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1871.57 2904,2904,2904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1879.6 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1896.88 2920,2920,2920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.42 2928,2928,2928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.76 2936,2936,2936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1862.84 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1811.73 2952,2952,2952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1886.62 2960,2960,2960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1886.08 2968,2968,2968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.94 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1932.83 2984,2984,2984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1857.91 2992,2992,2992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1867.83 3000,3000,3000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1875.91 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1889.47 3016,3016,3016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1901.35 3024,3024,3024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.98 3032,3032,3032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1862.22 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1868.68 3048,3048,3048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1888.37 3056,3056,3056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1891.01 3064,3064,3064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.23 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.76 3080,3080,3080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1857.73 3088,3088,3088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1866.27 3096,3096,3096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1878.16 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1886.68 3112,3112,3112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.16 3120,3120,3120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1929.64 3128,3128,3128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1861.46 
3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1874.3 3144,3144,3144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1885.98 3152,3152,3152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1891.92 3160,3160,3160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.1 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1948.38 3176,3176,3176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1870.94 3184,3184,3184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1880.62 3192,3192,3192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1891.83 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.79 3208,3208,3208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.34 3216,3216,3216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.77 3224,3224,3224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1864.67 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1879.72 3240,3240,3240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1882.8 3248,3248,3248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.55 3256,3256,3256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1903.72 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1933.9 3272,3272,3272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1870.54 3280,3280,3280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1878.6 3288,3288,3288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1887.61 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.91 3304,3304,3304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.06 3312,3312,3312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1935.67 3320,3320,3320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1872.91 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1858.24 3336,3336,3336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1888.31 3344,3344,3344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.73 3352,3352,3352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.72 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.65 3368,3368,3368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1877.45 3376,3376,3376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1888.07 
3384,3384,3384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.18 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.43 3400,3400,3400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1913.7 3408,3408,3408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.8 3416,3416,3416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1882.12 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1891.37 3432,3432,3432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1898.53 3440,3440,3440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.38 3448,3448,3448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.74 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.66 3464,3464,3464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1884.19 3472,3472,3472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1893.45 3480,3480,3480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.41 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.34 3496,3496,3496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.97 3504,3504,3504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.7 3512,3512,3512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1884.17 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1890.26 3528,3528,3528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.99 3536,3536,3536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.18 3544,3544,3544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.37 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.47 3560,3560,3560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1885.01 3568,3568,3568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.54 3576,3576,3576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.57 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1742.1 3592,3592,3592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.35 3600,3600,3600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1945.89 3608,3608,3608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1890.1 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1898.8 3624,3624,3624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.82 
3632,3632,3632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1918.68 3640,3640,3640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.56 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1941.74 3656,3656,3656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1887.02 3664,3664,3664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1894.27 3672,3672,3672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.95 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.5 3688,3688,3688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1918.74 3696,3696,3696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1948.22 3704,3704,3704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.34 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1898.86 3720,3720,3720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.97 3728,3728,3728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1918.72 3736,3736,3736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.3 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.05 3752,3752,3752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1889.03 3760,3760,3760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.49 3768,3768,3768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.51 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.3 3784,3784,3784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1925.07 3792,3792,3792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.38 3800,3800,3800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1897.19 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.06 3816,3816,3816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.6 3824,3824,3824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.82 3832,3832,3832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.14 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1875.15 3848,3848,3848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1887.66 3856,3856,3856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1893.96 3864,3864,3864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1903.78 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.99 
3880,3880,3880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.53 3888,3888,3888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1941.83 3896,3896,3896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.19 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.6 3912,3912,3912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1910.37 3920,3920,3920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.83 3928,3928,3928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1925.4 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.96 3944,3944,3944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1898.41 3952,3952,3952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.99 3960,3960,3960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.44 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.96 3976,3976,3976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.24 3984,3984,3984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1947.14 3992,3992,3992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.03 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.76 4008,4008,4008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.77 4016,4016,4016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1923.13 4024,4024,4024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.9 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1946.1 4040,4040,4040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.49 4048,4048,4048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1906.57 4056,4056,4056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.28 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.84 4072,4072,4072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1929.5 4080,4080,4080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.23 4088,4088,4088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1892.9 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,311.358 4104,4104,4104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1901.62 4112,4112,4112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.47 4120,4120,4120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.16 
4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.53 4136,4136,4136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1899.16 4144,4144,4144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1906.86 4152,4152,4152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.95 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.26 4168,4168,4168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.26 4176,4176,4176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1944.16 4184,4184,4184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.27 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.18 4200,4200,4200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.68 4208,4208,4208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.57 4216,4216,4216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1929.24 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.57 4232,4232,4232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.94 4240,4240,4240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.17 4248,4248,4248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.7 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1925.57 4264,4264,4264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.29 4272,4272,4272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1944.64 4280,4280,4280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1895.88 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1902.5 4296,4296,4296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.95 4304,4304,4304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.55 4312,4312,4312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.93 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1945.5 4328,4328,4328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1896.89 4336,4336,4336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1903.3 4344,4344,4344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1906.96 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1495.41 4360,4360,4360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.02 4368,4368,4368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1938.51 
4376,4376,4376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1891.98 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1896.57 4392,4392,4392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.74 4400,4400,4400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1918.89 4408,4408,4408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.97 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1948.4 4424,4424,4424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1904.36 4432,4432,4432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.68 4440,4440,4440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.97 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.5 4456,4456,4456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.41 4464,4464,4464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.84 4472,4472,4472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.0 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.62 4488,4488,4488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.43 4496,4496,4496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.73 4504,4504,4504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.57 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1952.77 4520,4520,4520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.1 4528,4528,4528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.57 4536,4536,4536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.25 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.79 4552,4552,4552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.5 4560,4560,4560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1947.34 4568,4568,4568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1905.15 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.28 4584,4584,4584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.39 4592,4592,4592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.85 4600,4600,4600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1926.08 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1324.96 4616,4616,4616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1900.53 
4624,4624,4624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.54 4632,4632,4632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.76 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.73 4648,4648,4648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1932.99 4656,4656,4656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1953.0 4664,4664,4664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1908.23 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.3 4680,4680,4680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.49 4688,4688,4688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.45 4696,4696,4696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1935.24 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.75 4712,4712,4712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1904.86 4720,4720,4720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.98 4728,4728,4728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.34 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.51 4744,4744,4744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.95 4752,4752,4752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1951.93 4760,4760,4760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.16 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.45 4776,4776,4776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.55 4784,4784,4784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.51 4792,4792,4792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.56 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.39 4808,4808,4808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1907.03 4816,4816,4816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.79 4824,4824,4824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.68 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1926.19 4840,4840,4840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1932.28 4848,4848,4848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1953.69 4856,4856,4856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.05 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1385.21 
4872,4872,4872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.36 4880,4880,4880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.1 4888,4888,4888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.93 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.3 4904,4904,4904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1911.49 4912,4912,4912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.34 4920,4920,4920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.18 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.8 4936,4936,4936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1934.44 4944,4944,4944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.67 4952,4952,4952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1909.27 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.97 4968,4968,4968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.57 4976,4976,4976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.38 4984,4984,4984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1935.7 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.77 5000,5000,5000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.25 5008,5008,5008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.39 5016,5016,5016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1923.3 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.67 5032,5032,5032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.51 5040,5040,5040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1952.69 5048,5048,5048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1915.71 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.02 5064,5064,5064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1926.49 5072,5072,5072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.87 5080,5080,5080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1937.45 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1951.47 5096,5096,5096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.54 5104,5104,5104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.38 5112,5112,5112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.88 
5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,659.501 5128,5128,5128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1933.5 5136,5136,5136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1947.68 5144,5144,5144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1910.48 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.72 5160,5160,5160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1923.71 5168,5168,5168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.92 5176,5176,5176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1938.75 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1949.89 5192,5192,5192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1913.85 5200,5200,5200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.47 5208,5208,5208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.83 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.76 5224,5224,5224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.66 5232,5232,5232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.24 5240,5240,5240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.26 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.61 5256,5256,5256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1926.71 5264,5264,5264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1930.73 5272,5272,5272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1935.98 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1955.27 5288,5288,5288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1914.98 5296,5296,5296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.82 5304,5304,5304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.84 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1932.24 5320,5320,5320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.9 5328,5328,5328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.69 5336,5336,5336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.03 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.11 5352,5352,5352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.81 5360,5360,5360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.99 
5368,5368,5368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.04 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1317.72 5384,5384,5384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1912.73 5392,5392,5392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1918.38 5400,5400,5400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1925.7 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.53 5416,5416,5416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1938.21 5424,5424,5424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1950.83 5432,5432,5432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1921.66 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1922.97 5448,5448,5448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.9 5456,5456,5456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1933.03 5464,5464,5464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1940.08 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1951.92 5480,5480,5480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1917.98 5488,5488,5488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.78 5496,5496,5496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1929.16 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.22 5512,5512,5512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.12 5520,5520,5520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1953.18 5528,5528,5528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.09 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1924.8 5544,5544,5544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.11 5552,5552,5552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.74 5560,5560,5560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1942.7 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1954.79 5576,5576,5576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1920.56 5584,5584,5584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1925.34 5592,5592,5592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.03 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1938.76 5608,5608,5608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1941.86 
5616,5616,5616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1951.54 5624,5624,5624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1916.83 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1080.8 5640,5640,5640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.04 5648,5648,5648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.79 5656,5656,5656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.1 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1951.2 5672,5672,5672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1919.46 5680,5680,5680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1927.55 5688,5688,5688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1931.1 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1936.2 5704,5704,5704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1942.42 5712,5712,5712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1955.27 5720,5720,5720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1923.53 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1928.69 5736,5736,5736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1933.32 5744,5744,5744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1939.19 5752,5752,5752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1943.62 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,4,S9150_14.50.2,1947.44
clblas-2.10/doc/performance/clBLAS_2.7.1/W9100/clblas271_w9100_dtrsm_col_left_lower_unit_14502.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,8.76417 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,58.4721 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,149.36 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,266.636 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,402.608 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,509.134 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,620.235
1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,381.995 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,786.587 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,850.987 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,924.658 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,873.356 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1015.78 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1040.93 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1060.2 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,613.395 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1111.35 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1125.58 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1150.94 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1008.84 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1193.15 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1180.7 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1189.88 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,820.989 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1213.96 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1209.88 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1233.14 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1091.0 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1246.52 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_lower_unit_14502,1236.78
clblas-2.10/doc/performance/clBLAS_2.7.1/W9100/clblas271_w9100_dtrsm_col_left_upper_unit_14502.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,9.27487 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,60.4199 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,152.283 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,271.667 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,410.521 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,514.267 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,629.426 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,380.803 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,786.084 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,854.922 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,926.558 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,874.374 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1004.46 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1045.8 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1054.79 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,617.848 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1101.03 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1126.04 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1145.07 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1009.83 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1186.18 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1180.93 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1181.57 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,824.356 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1206.0 
4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1210.18 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1226.17 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1092.12 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1238.66 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_left_upper_unit_14502,1237.26
clblas-2.10/doc/performance/clBLAS_2.7.1/W9100/clblas271_w9100_dtrsm_col_right_lower_unit_14502.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,8.93028 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,58.3999 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,149.116 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,279.346 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,408.532 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,518.924 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,631.504 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,540.503 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,790.175 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,868.159 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,940.942 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,964.969 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1019.57 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1074.38 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1060.53 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,956.6 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1114.54
3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1144.7 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1156.12 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1133.81 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1196.38 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1193.58 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1191.64 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1125.08 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1216.42 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1229.49 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1236.81 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1223.5 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1248.05 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_lower_unit_14502,1250.83
clblas-2.10/doc/performance/clBLAS_2.7.1/W9100/clblas271_w9100_dtrsm_col_right_upper_unit_14502.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,10.6716 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,70.5926 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,180.089 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,285.734 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,408.363 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,555.832 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,690.637 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,519.496 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,863.036
1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,901.732 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1036.75 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1045.06 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1126.92 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1170.15 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1148.58 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,981.035 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1198.84 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1220.03 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1244.95 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1177.25 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1272.59 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1262.03 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1301.38 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1216.99 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1315.69 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1319.76 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1343.41 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1312.79 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1357.02 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,clblas271_w9100_dtrsm_col_right_upper_unit_14502,1348.39
clblas-2.10/doc/performance/clBLAS_2.9.0/FIJINANO/clblas290_fijinano_cgemm_col_nt_1520.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS
32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,2.27109 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,16.9685 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,56.0058 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,114.849 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,212.577 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,355.603 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,519.542 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,761.754 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,1040.16 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,1299.26 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,1566.48 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,1828.98 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,2134.13 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,2345.89 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,2754.89 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3076.47 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3224.44 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,2827.18 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3179.31 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3620.86 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3874.67 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3822.13 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,3994.36 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4218.83 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4307.77 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4039.92 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4377.48 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4318.9 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4234.38 
960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4378.36 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4352.72 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4574.29 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4368.32 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4806.25 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4460.99 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4507.26 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4297.52 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4860.26 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4952.9 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4731.69 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4559.33 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5372.62 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5095.49 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5481.53 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4799.71 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5530.91 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5105.86 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6066.23 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4847.53 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5688.9 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5136.93 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5813.04 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5092.65 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5820.54 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5121.03 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5812.71 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5508.38 
1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5795.56 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5101.97 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5860.74 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5136.77 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5855.37 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6058.81 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6210.21 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5369.68 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5953.98 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5208.62 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6042.74 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5710.64 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6060.97 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4660.35 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6165.61 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4724.36 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6208.71 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6125.46 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6220.96 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4926.11 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6275.64 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4941.56 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6288.88 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5955.3 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6148.15 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,4995.12 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6216.62 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5107.04 
2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6278.74 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5950.11 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6209.58 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5166.41 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6235.75 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5255.43 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6167.68 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5983.15 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6241.18 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5296.91 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6328.93 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5401.12 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6256.43 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6054.99 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6210.48 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5382.7 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6283.52 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5484.96 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6238.47 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6158.6 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6333 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5554.46 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6278.21 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5546.68 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6253.18 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6312.68 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6322.61 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5653.63 
3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6320.16 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5647.68 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6298.23 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6458.47 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6280.78 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5675.14 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6266.23 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6304.8 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6380.82 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6365.85 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5775.86 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6317.74 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5762.54 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6390.06 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6539.81 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6302.33 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5713.8 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6294.71 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5731.62 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6296.78 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6466.8 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6307.8 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5785.34 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6313.17 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5816.38 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6322.05 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6506.97 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6329.95 
4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5854.87 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6387.85 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5915.68 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6328.11 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6528.81 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6336.43 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5887.01 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6351.91 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5920.75 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6305 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6565.29 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6326.93 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5919.32 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6282.22 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5906.62 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6309.3 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6602.44 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6315.78 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5936.27 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6302.72 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5945.09 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6260.44 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6504.82 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6297.45 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5985.25 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6288.26 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,5955.09 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6337.93 
5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6633.58 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6329.98 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6012.14 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6320.96 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6011.62 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6328.59 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6596.78 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6344.19 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6052.5 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,clblas,1,cgemm_col_nt_fijinano_1520,6327.9 clblas-2.10/doc/performance/clBLAS_2.9.0/FIJINANO/clblas290_fijinano_dgemm_col_nt_1520.csv000066400000000000000000000530101264277366700302130ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,0.568253 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,4.20387 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,13.4051 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,29.3509 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,56.6476 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,86.8476 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,103.255 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,149.302 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,184.036 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,219.252 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,254.561 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,285.834 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,285.295 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,316.146 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,324.806 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,346.069 
544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,349.757 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,367.622 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,374.997 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,375.769 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,392.726 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,384.136 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,380.629 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,396.55 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,377.09 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,365.329 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,401.467 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,392.719 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,384.13 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,415.676 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,394.292 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,412.433 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,413.993 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,410.952 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,418.839 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,425.103 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,429.722 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,430.361 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,435.794 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,401.553 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,431.411 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,443.986 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,449.915 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,427.727 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,380.408 
1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,419.598 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,448.978 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,457.268 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,444.243 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,448.552 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,454.429 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,445.253 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,458.888 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,440.84 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,453.311 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,438.923 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,457.468 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,437.897 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,457.644 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,438.892 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,455.919 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,439.695 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,456.162 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.513 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,456.825 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,444.518 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,459.391 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,447.856 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,461.491 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,451.1 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,461.958 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,454.788 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,456.006 
2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,459.22 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,465.416 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,463.054 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,462.106 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.368 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,464.866 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,473.232 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,465.142 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,461.494 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,466.096 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,466.385 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,466.829 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,472.532 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,464.989 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,463.849 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,467.435 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.371 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,461.155 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,462.733 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,464.826 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.237 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,465.539 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,475.803 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,462.843 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.2 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,466.096 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,465.521 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,467.517 
3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,472.479 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.456 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,467.975 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,453.119 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,475.463 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.422 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,472.629 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.91 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.651 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.308 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,477.378 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.993 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,475.061 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,448.349 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,473.331 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.825 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,471.944 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,467.996 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,471.366 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.394 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.755 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.739 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.603 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,447.095 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.389 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.23 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,477.789 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.56 
4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.986 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,468.34 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,471.605 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,466.997 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,472.334 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,454.612 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,473.338 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.383 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,474.496 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.206 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,475.792 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.1 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.957 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.829 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,478.697 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,457.665 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,474.469 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.294 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.366 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.72 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,478.014 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.845 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,474.922 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.549 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.999 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,454.416 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,474.149 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.952 
5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.613 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.82 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,478.929 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,469.876 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.077 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.222 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,474.855 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,457.659 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,477.576 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.947 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,475.983 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.102 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,478.681 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.904 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,477.53 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,470.762 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,476.448 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,455.526 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,479.392 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,478.471 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,471.071 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,clblas,1,dgemm_col_nt_fijinano_1520,477.734 clblas-2.10/doc/performance/clBLAS_2.9.0/FIJINANO/clblas290_fijinano_sgemm_col_nt_1520.csv000066400000000000000000000532111264277366700302350ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,0.60684 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5.09699 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,14.5356 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,34.6077 
160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,60.6455 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,104.155 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,153.214 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,207.73 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,299.856 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,374.548 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,540.344 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,622.353 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,821.506 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,944.622 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1004.71 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1158.14 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1383.02 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1329.11 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1452.31 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1724.13 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1782.92 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2163.34 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1905.82 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2397.62 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2313.14 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2795.54 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2525.39 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2736.36 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2127.52 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2980.87 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2372.01 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3117.52 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3579.84 
1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3192.79 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2966.26 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3277.58 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,1988.19 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3728.34 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3773.44 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3635.89 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2415.59 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3630.75 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2421.11 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3928.51 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3748.43 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4153.55 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2646.48 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3959.66 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,2796.67 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3930.28 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4353.31 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4296.96 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3079.04 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4356.07 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3151.57 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4386.75 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4603.33 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4476.75 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3402.99 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4693.63 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3530.31 
1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4452.01 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4646.45 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3823.33 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3537.6 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4830.55 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3674.05 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4603.17 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4957.33 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4666.21 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3851.61 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4984.33 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3883.45 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4863.74 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4834.35 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4916.05 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3941.51 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5336.53 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,3979.52 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4763.61 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5406.48 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4853.11 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4176.85 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5303.34 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4106.41 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4853.53 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5317.52 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4891.49 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4276.56 
2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5357.33 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4211.37 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4921.81 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5351.42 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4858.36 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4299.44 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5371.58 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4376.05 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4886.07 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5454.74 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5558.2 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4429.5 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5496.64 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4439.38 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4991.01 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5575.6 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4976.63 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4516.52 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5559.08 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4511.75 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5472.91 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5403.71 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4988.5 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4527.98 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5462.29 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4582.86 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4967.53 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5519.76 
3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4972.07 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4571.72 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5552.53 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4565.33 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4935.22 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5456.71 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5023.36 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5356.04 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5499.31 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4332.46 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5328.37 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5594.86 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5359.37 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4301.34 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5643.82 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4461.93 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4979.14 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5568.24 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5005.11 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4391.89 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5641.16 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4562.46 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5398.88 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5586.66 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5052.13 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4470.7 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5653.48 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5345.03 
4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5033.8 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5610.38 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5049.65 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4574.24 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5579.89 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4659.21 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5040.33 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5657.27 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5012.52 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5310.31 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5613.26 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4744.89 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5067.34 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5589.2 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5587.76 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4696.9 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5710.75 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4859.73 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5071.53 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5669.19 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5050.98 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4784.7 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5643.69 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4870.21 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5331.77 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5614.28 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5044.21 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4803.81 
5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5698.98 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5315.34 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4754.54 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5687.94 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5065.11 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,4918.36 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,1,sgemm_col_nt_fijinano_1520,5681.15 clblas-2.10/doc/performance/clBLAS_2.9.0/FIJINANO/clblas290_fijinano_zgemm_col_nt_1520.csv000066400000000000000000000455671264277366700302630ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,1.98154 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,14.361 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,45.0171 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,95.0874 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,150.551 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,195.774 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,238.205 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,294.947 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,289.614 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,313.955 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,360.831 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,400.813 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,391.687 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,382.485 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,387.621 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,426.092 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,406.356 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,410.041 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,429.266 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,430.198 
672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,433.371 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,437.61 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,440.554 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,459.728 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,446.541 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,442.845 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,434.908 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,436.278 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,436.605 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,438.456 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,437.347 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,469.33 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,443.024 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,445.782 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,450.069 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,454.125 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,459.064 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,463.283 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,468.245 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.741 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,461.61 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,467.48 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.432 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,464.372 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,466.837 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,463.102 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,470.008 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.685 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,470.618 
1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,466.159 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.025 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,468.973 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.288 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.113 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,470.842 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.093 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.26 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,474.714 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.502 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.809 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.965 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.754 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.775 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.181 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.916 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,472.708 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.697 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,474.63 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.811 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.911 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.246 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.88 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.594 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.516 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.326 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.801 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478 
2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.062 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.614 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.176 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.935 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,465.732 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.459 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.274 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.952 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.113 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.558 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,467.556 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.646 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.863 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.495 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,466.05 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.713 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,472.739 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.459 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.41 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.734 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.497 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.236 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,467.953 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.219 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.557 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.365 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,471.463 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.015 
3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.665 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.862 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.338 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.238 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,472.632 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.901 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.052 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.988 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.89 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.265 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.202 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.926 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,474.886 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.617 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.963 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.733 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.352 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.02 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.043 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.627 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,472.901 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.464 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.308 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.546 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,473.493 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.867 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,474.024 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.363 
4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,474.749 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,481.049 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,475.613 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.277 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.714 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,481.331 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,477.911 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.977 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.296 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.836 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.697 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.873 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.518 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.683 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.089 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,481.439 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,479.959 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,480.616 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,476.623 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,481.322 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,478.839 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,472.755 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,467.832 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,clblas,1,zgemm_col_nt_fijinano_1520,470.144 clblas-2.10/doc/performance/cuBLAS_7.0/000077500000000000000000000000001264277366700174235ustar00rootroot00000000000000clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/000077500000000000000000000000001264277366700211115ustar00rootroot00000000000000clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/README.txt000066400000000000000000000017331264277366700226130ustar00rootroot00000000000000################################ # # # Benchmarking Methodology # # # ################################ ############ # Hardware # ############ Tesla K40 ############ # Software # ############ openSUSE 13.2 cuBLAS 7.0 
driver 346.47 ############ # Settings # ############ gpu clocks: set to boost level using nvidia-smi cuBLAS: m=n=k=lda=ldb=ldc (for simplicity) alpha=beta=1 gemms were column-major, op(A,B)=N,T ############ # Sampling # ############ For each data point, we took 10 samples. Each sample consists of 10 gemm calls with a wait afterward. Outlying samples beyond 1 standard deviation were removed (rarely if ever did this actually need to happen). Before running the 10 samples, one warm-up sample was executed (but not included in the statistics). GFlop/s was calculated as (2*m*n*k flops) / (host time for 10 kernels / 10) // real data (8*m*n*k flops) / (host time for 10 kernels / 10) // complex data clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/dgemm.csv000066400000000000000000000436671264277366700227250ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,3.16141 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,18.7984 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,49.6624 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,100.535 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,171.884 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,264.594 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,378.304 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,468.899 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,591.284 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,738.767 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,625.024 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,746.514 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,871.618 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1016.62 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,791.271 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,725.501 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,796.07 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,892.025 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,931.533 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1128.18 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,975.19 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1066.17 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1018.67 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1117.65 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1125.88 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1212.63 
864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1105.09 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1159.84 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1186.92 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1245.23 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1154.6 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1216.7 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1195.69 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1267.04 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1212.05 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1279.68 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1209.98 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1275.97 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1222.62 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1323.61 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1235.81 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1265.62 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1215.76 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.26 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1220.06 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1278.86 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1242.6 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1305.04 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.24 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1294.75 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1234.82 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1286.07 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1240.76 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1286.32 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1230.09 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1283.41 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1228.72 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1261.06 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1210.65 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1273.73 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1241.72 
1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1272.8 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1231.53 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1308.15 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1241.36 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1240.04 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1195.59 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1255.23 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1210.91 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.09 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1213.67 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.53 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1215.32 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1249.09 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1208.89 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1232.54 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1214.86 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.99 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1220.02 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1252.48 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1219.44 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1249.59 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1219.07 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1249.54 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1221.87 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1249.4 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1219.21 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.43 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1227.33 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.02 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1216.81 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1245.43 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1220.85 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.69 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1220.33 
3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1251.9 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1222.41 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.65 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1222.15 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1248.37 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1223.94 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1248.55 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1225.1 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1251.62 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1225.31 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1248.51 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1228.08 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1255.73 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1230.74 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.47 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1229.31 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1252.74 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1230.44 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1252.14 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1230.54 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.13 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1232.32 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.31 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1233.0 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.5 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1234.26 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1254.61 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1232.91 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1252.44 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1233.66 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1253.19 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1235.11 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.21 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1237.07 
4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1256.45 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1237.48 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.16 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1236.92 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1256.3 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1237.04 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1256.22 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1237.72 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1255.73 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1239.15 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1258.95 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1239.22 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1256.17 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1239.32 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1258.14 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1241.53 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.02 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1239.91 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.2 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1243.42 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1260.38 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1241.12 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.8 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1243.45 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.71 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1242.14 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.76 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1242.62 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1258.55 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1243.27 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.61 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.44 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1257.7 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1228.81 
5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1260.18 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1244.29 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1259.48 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1245.69 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1263.04 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.48 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1261.15 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1245.99 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1262.14 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.24 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1260.62 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.29 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1261.69 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.91 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1260.99 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1246.81 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40,1262.55 clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/dtrsm.csv000066400000000000000000000062231264277366700227620ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,46.1487 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,147.678 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,244.579 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,374.837 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,416.398 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,506.299 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,594.796 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,688.01 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,774.825 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,883.356 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,603.195 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,671.339 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,717.217 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,785.274 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,830.294 
3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,895.983 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,946.919 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1009.79 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1057.67 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1126.96 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,849.814 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,903.301 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,928.784 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,980.563 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1005.71 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1061.05 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1089.49 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1142.6 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1169.34 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,cublas,K40_dtrsm,1221.82 clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/peak_dp.csv000066400000000000000000000431361264277366700232400ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 
Peak,1660 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/peak_sp.csv000066400000000000000000000431361264277366700232570ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/sgemm.csv000066400000000000000000000511001264277366700227330ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3.97911 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,24.6608 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,76.7334 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,164.289 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,287.539 
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,464.733 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,543.099 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,732.31 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,934.397 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1185.96 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1087.23 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1279.18 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1185.33 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1372.44 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1367.44 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1378.57 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1543.45 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1759.29 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1633.16 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2180.63 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1809.14 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1984.89 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1954.4 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2077.67 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,1960.48 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2070.35 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2083.88 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2394.47 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2348.25 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2716.16 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2405.18 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2274.73 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2227.42 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2344.43 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2383.68 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2703.3 
1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2512.67 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2614.45 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2552.61 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3009.58 1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2722.31 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2602.76 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2692.22 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2817.5 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2732.14 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2856.67 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2693.53 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2701.02 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2744.06 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2858.5 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2849.46 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2971.45 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2884.4 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3024.61 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2968.11 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2911.38 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2940.92 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3011.91 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3005.63 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3187.09 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2983.85 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3088.32 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2985.92 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3159.47 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3004.28 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3143.35 
2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3032.66 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3113.98 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3004.97 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3087.58 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2997.36 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3137.38 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2995.4 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3069.01 2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3013.48 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3107.17 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,2955.76 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3152.14 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3043.96 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3208.24 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3077.94 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3152.33 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3059.59 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3133.47 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3053.69 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3128.14 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3058.3 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3188.94 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3064.88 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3239.46 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3121.65 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3188.38 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3109.72 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3172.39 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3063.9 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3240.27 
3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3131.28 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3196.59 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3123.56 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3186.85 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3105.19 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3185.27 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3123.99 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3241.21 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3141.84 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3200.51 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3132.12 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3230.72 3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3153.98 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3205.6 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3127.98 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3240.97 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3160.2 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3254.24 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3177.79 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3230.99 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3155.12 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3208.55 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3137.72 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3296.57 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3214.6 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3265.45 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3190.31 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3238.14 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3164.75 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3250.24 
4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3184.64 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3275.29 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3208.99 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3256.53 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3188.44 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3259.74 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3195.44 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3242.99 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3182.62 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3282.08 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3167.87 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3271.03 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3221.74 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3259.13 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3193.43 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3196.28 4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3113.92 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3315.38 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3206.18 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3205.44 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3174.2 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3208.93 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3119.11 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3237.65 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3153.55 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3260.31 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3160.7 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3191.98 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3125.23 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3210.36 
5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3155.2 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3163.38 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3106.3 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3206.29 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3128.68 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3208.48 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3154.3 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3195.15 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3130.09 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3145.16 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3099.87 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3295.12 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3155.32 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3188.74 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3133.55 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3145.54 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3083.16 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3168.53 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3117.89 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3221.5 5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3123.38 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3159.69 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3098.44 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,sgemm,gpu,clblas,cublas_sgemmNT_k40,3201.51 clblas-2.10/doc/performance/cuBLAS_7.0/Tesla_K40/zgemm.csv000066400000000000000000000436631264277366700227610ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,8.85622 64,64,64,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,55.9241 96,96,96,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,158.697 128,128,128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,324.511 160,160,160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,537.18 
192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,599.186 224,224,224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,802.1 256,256,256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,815.418 288,288,288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1028.54 320,320,320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,946.027 352,352,352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1015.76 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.95 416,416,416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1050.01 448,448,448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1082.34 480,480,480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.97 512,512,512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1271.15 544,544,544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1197.06 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1313.54 608,608,608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1192.42 640,640,640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1309.74 672,672,672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1166.05 704,704,704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1278.31 736,736,736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1237.92 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1336.88 800,800,800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1303.92 832,832,832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1368.21 864,864,864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1309.95 896,896,896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1350.68 928,928,928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1311.2 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1400.09 992,992,992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1325.0 1024,1024,1024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1373.51 1056,1056,1056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1345.62 1088,1088,1088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1390.97 1120,1120,1120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1364.29 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1358.79 1184,1184,1184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1344.7 1216,1216,1216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1388.08 1248,1248,1248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1352.81 1280,1280,1280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1397.32 
1312,1312,1312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1357.03 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1370.3 1376,1376,1376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1330.2 1408,1408,1408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1361.45 1440,1440,1440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1335.76 1472,1472,1472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1337.47 1504,1504,1504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1308.98 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1324.72 1568,1568,1568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1281.6 1600,1600,1600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1317.0 1632,1632,1632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1276.15 1664,1664,1664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1302.72 1696,1696,1696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1264.41 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1292.11 1760,1760,1760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1251.54 1792,1792,1792,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1275.55 1824,1824,1824,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1255.08 1856,1856,1856,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1265.29 1888,1888,1888,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1248.93 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1260.71 1952,1952,1952,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1248.52 1984,1984,1984,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1259.53 2016,2016,2016,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1236.35 2048,2048,2048,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1255.18 2080,2080,2080,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1235.09 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1255.38 2144,2144,2144,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1226.74 2176,2176,2176,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1246.9 2208,2208,2208,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1234.59 2240,2240,2240,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1248.11 2272,2272,2272,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1225.74 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1232.54 2336,2336,2336,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.53 2368,2368,2368,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1229.74 
2400,2400,2400,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.69 2432,2432,2432,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1235.74 2464,2464,2464,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.44 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1228.84 2528,2528,2528,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1212.09 2560,2560,2560,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1226.52 2592,2592,2592,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.88 2624,2624,2624,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1227.2 2656,2656,2656,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1212.57 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1225.92 2720,2720,2720,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.08 2752,2752,2752,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1219.52 2784,2784,2784,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1207.58 2816,2816,2816,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.47 2848,2848,2848,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1212.67 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1219.91 2912,2912,2912,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1215.19 2944,2944,2944,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1228.28 2976,2976,2976,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.97 3008,3008,3008,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.56 3040,3040,3040,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.93 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.13 3104,3104,3104,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.51 3136,3136,3136,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.06 3168,3168,3168,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1213.86 3200,3200,3200,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.15 3232,3232,3232,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.68 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1223.02 3296,3296,3296,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1197.04 3328,3328,3328,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1222.72 3360,3360,3360,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1214.58 3392,3392,3392,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1224.13 3424,3424,3424,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1180.78 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1190.28 
3488,3488,3488,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1179.39 3520,3520,3520,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1188.9 3552,3552,3552,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1182.33 3584,3584,3584,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1215.58 3616,3616,3616,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.95 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1216.44 3680,3680,3680,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1206.74 3712,3712,3712,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.32 3744,3744,3744,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1207.58 3776,3776,3776,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.07 3808,3808,3808,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.03 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1220.29 3872,3872,3872,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.99 3904,3904,3904,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.11 3936,3936,3936,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.6 3968,3968,3968,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1219.52 4000,4000,4000,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.33 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1220.61 4064,4064,4064,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.28 4096,4096,4096,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1220.14 4128,4128,4128,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1212.22 4160,4160,4160,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1219.51 4192,4192,4192,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.31 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.0 4256,4256,4256,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.01 4288,4288,4288,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.03 4320,4320,4320,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.61 4352,4352,4352,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1221.04 4384,4384,4384,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.95 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1219.6 4448,4448,4448,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.45 4480,4480,4480,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.83 4512,4512,4512,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.82 4544,4544,4544,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.4 
4576,4576,4576,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.59 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.29 4640,4640,4640,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.13 4672,4672,4672,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.82 4704,4704,4704,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.76 4736,4736,4736,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.25 4768,4768,4768,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.92 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.34 4832,4832,4832,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.81 4864,4864,4864,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.88 4896,4896,4896,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.86 4928,4928,4928,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.97 4960,4960,4960,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.78 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.54 5024,5024,5024,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.49 5056,5056,5056,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.92 5088,5088,5088,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1209.54 5120,5120,5120,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.9 5152,5152,5152,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.69 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.85 5216,5216,5216,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.93 5248,5248,5248,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.57 5280,5280,5280,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.29 5312,5312,5312,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.4 5344,5344,5344,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.74 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1216.84 5408,5408,5408,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.79 5440,5440,5440,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.77 5472,5472,5472,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.61 5504,5504,5504,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.71 5536,5536,5536,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.11 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.21 5600,5600,5600,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1211.23 5632,5632,5632,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1218.04 
5664,5664,5664,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.93 5696,5696,5696,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1217.29 5728,5728,5728,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1210.48 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,K40,1216.58 clblas-2.10/doc/performance/cuBLAS_7.5/000077500000000000000000000000001264277366700174305ustar00rootroot00000000000000clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/000077500000000000000000000000001264277366700211165ustar00rootroot00000000000000clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas75_k40_dtrsm_col_left_lower_unit.csv000066400000000000000000000077311264277366700312750ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,44.2873 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,142.747 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,236.511 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,355.784 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,396.26 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,487.216 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,566.971 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,655.908 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,732.537 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,847.395 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,591.049 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,654.173 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,702.401 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,770.226 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,811.34 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,883.42 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,926.542 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,996.371 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1039.3 
3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1101.67 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,839.118 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,891.395 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,916.978 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,965.043 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,987.096 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1043.79 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1069.03 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1124.31 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1149.83 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,left,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_lower_unit,1199.03 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas75_k40_dtrsm_col_left_upper_unit.csv000066400000000000000000000077311264277366700313000ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,43.9888 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,141.241 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,233.953 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,351.516 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,392.283 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,479.918 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,558.511 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,654.557 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,726.139 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,848.549 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,584.527 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,654.559 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,695.97 
2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,760.952 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,805.657 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,872.28 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,918.805 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,983.471 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1027.22 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1088.97 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,831.697 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,882.067 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,907.162 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,955.774 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,978.45 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1034.06 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1059.97 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1113.23 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1140.36 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,left,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_left_upper_unit,1189.4 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas75_k40_dtrsm_col_right_lower_unit.csv000066400000000000000000000100261264277366700314470ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,45.3487 384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,145.134 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,239.863 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,367.172 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,406.918 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,494.478 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,582.255 
1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,668.13 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,751.387 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,858.809 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,594.17 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,653.234 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,704.493 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,772.181 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,816.622 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,884.772 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,929.677 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,998.746 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1041.95 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1108.13 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,839.438 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,891.82 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,916.948 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,969.092 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,995.416 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1049.13 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1077.21 5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1129.98 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1155.79 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,lower,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_lower_unit,1207.01 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas75_k40_dtrsm_col_right_upper_unit.csv000066400000000000000000000100241264277366700314500ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 192,192,192,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,46.1787 
384,384,384,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,147.372 576,576,576,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,245.046 768,768,768,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,374.345 960,960,960,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,416.657 1152,1152,1152,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,506.312 1344,1344,1344,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,594.72 1536,1536,1536,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,688.445 1728,1728,1728,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,774.941 1920,1920,1920,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,884.055 2112,2112,2112,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,602.674 2304,2304,2304,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,670.976 2496,2496,2496,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,716.48 2688,2688,2688,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,784.703 2880,2880,2880,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,829.554 3072,3072,3072,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,895.88 3264,3264,3264,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,946.6 3456,3456,3456,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1008.89 3648,3648,3648,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1056.94 3840,3840,3840,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1126.45 4032,4032,4032,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,847.914 4224,4224,4224,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,901.107 4416,4416,4416,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,928.518 4608,4608,4608,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,980.991 4800,4800,4800,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1005.44 4992,4992,4992,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1060.87 5184,5184,5184,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1089.33 
5376,5376,5376,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1142.84 5568,5568,5568,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1169.42 5760,5760,5760,0,0,0,0,0,0,1.0,1.0,column,none,none,right,upper,unit,dtrsm,gpu,clblas,cublas75_k40_dtrsm_col_right_upper_unit,1222.24 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas_cgemm_8.csv000066400000000000000000002137121264277366700245110ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 8,8,8,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,0.333008 16,16,16,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1.86076 24,24,24,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,5.76901 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,12.8881 40,40,40,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,22.9391 48,48,48,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,37.8254 56,56,56,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,55.6628 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,79.5279 72,72,72,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,107.101 80,80,80,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,142.025 88,88,88,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,179.099 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,226.131 104,104,104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,273.856 112,112,112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,333.811 120,120,120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,391.948 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,462.31 136,136,136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,529.988 144,144,144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,613.138 152,152,152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,694.376 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,792.646 168,168,168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,883.191 176,176,176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,993.038 184,184,184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1095.78 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1222.43 200,200,200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,926.328 208,208,208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1010.12 216,216,216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1091.25 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1187.32 232,232,232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1257.04 240,240,240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1389.17 
248,248,248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1432.37 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1527.11 264,264,264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1590.47 272,272,272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1719.05 280,280,280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1825.15 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1970.34 296,296,296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2106.56 304,304,304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2227.95 312,312,312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2327.75 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2483.36 328,328,328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1444.14 336,336,336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1551.61 344,344,344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1729.2 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,1954.15 360,360,360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2066.25 368,368,368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2204.16 376,376,376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2282.29 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2391.56 392,392,392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2026.62 400,400,400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2141.27 408,408,408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2198.95 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2300.5 424,424,424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2377.39 432,432,432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2494.48 440,440,440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2582.7 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2691.98 456,456,456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2221.82 464,464,464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2301.25 472,472,472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2396.06 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2493.41 488,488,488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2523.38 496,496,496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2624.52 504,504,504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2691.7 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2795.77 520,520,520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2458.83 528,528,528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2555.58 
536,536,536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2645.6 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2728.98 552,552,552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2792.34 560,560,560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2894.91 568,568,568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.5 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3071.22 584,584,584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2414.85 592,592,592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2502.22 600,600,600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2625.06 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2855.49 616,616,616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2891.18 624,624,624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3007.11 632,632,632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3103.23 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3185.52 648,648,648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2610.05 656,656,656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2674.06 664,664,664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2749.32 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2825.88 680,680,680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2881.1 688,688,688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2971.86 696,696,696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3063.5 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3124.86 712,712,712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2850.55 720,720,720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2926.83 728,728,728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2990.05 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3057.81 744,744,744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3117.8 752,752,752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3201.65 760,760,760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3262.43 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3334.17 776,776,776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2873.15 784,784,784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2930.69 792,792,792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.63 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3029.05 808,808,808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.32 816,816,816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3130.19 
824,824,824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3201.95 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3279.06 840,840,840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2871.51 848,848,848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2942.18 856,856,856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3027.19 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3097.03 872,872,872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3141.79 880,880,880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3204.61 888,888,888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3256.74 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3323.3 904,904,904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2931.56 912,912,912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3003.54 920,920,920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044.68 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3106.12 936,936,936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3115.23 944,944,944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3190.73 952,952,952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3228.82 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3285.87 968,968,968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2946.15 976,976,976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.61 984,984,984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.03 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.03 1000,1000,1000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3138.88 1008,1008,1008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3183.86 1016,1016,1016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3224.78 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3293.3 1032,1032,1032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.54 1040,1040,1040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3021.19 1048,1048,1048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2988.99 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3110.33 1064,1064,1064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.08 1072,1072,1072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3132.61 1080,1080,1080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3158.35 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3284.48 1096,1096,1096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2927.3 1104,1104,1104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2972.26 
1112,1112,1112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3016.35 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3146.75 1128,1128,1128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3177.64 1136,1136,1136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3170.15 1144,1144,1144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3212.79 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3290.3 1160,1160,1160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2950.85 1168,1168,1168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3006.33 1176,1176,1176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.72 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3152.24 1192,1192,1192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.63 1200,1200,1200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3147.3 1208,1208,1208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3180.02 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3243.02 1224,1224,1224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2934.35 1232,1232,1232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2976.4 1240,1240,1240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3009.96 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.16 1256,1256,1256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.86 1264,1264,1264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3148.25 1272,1272,1272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3183.82 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3221.79 1288,1288,1288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2924.17 1296,1296,1296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3023.11 1304,1304,1304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.22 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.16 1320,1320,1320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3121.8 1328,1328,1328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3165.01 1336,1336,1336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3169.13 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3250.88 1352,1352,1352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2958.42 1360,1360,1360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3014.25 1368,1368,1368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2961.69 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3129.82 1384,1384,1384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3074.59 
1392,1392,1392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3131.63 1400,1400,1400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3158.14 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3181.29 1416,1416,1416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2998.04 1424,1424,1424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.7 1432,1432,1432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3058.62 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3140.26 1448,1448,1448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.47 1456,1456,1456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3073.2 1464,1464,1464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3092.45 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3203.43 1480,1480,1480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2964.86 1488,1488,1488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3057.79 1496,1496,1496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3023.78 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3120.55 1512,1512,1512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.64 1520,1520,1520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3144.74 1528,1528,1528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3082.56 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3148.1 1544,1544,1544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2877.58 1552,1552,1552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2929.74 1560,1560,1560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2981.76 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2994.1 1576,1576,1576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3026.52 1584,1584,1584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3030.09 1592,1592,1592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3060.62 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.11 1608,1608,1608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2861.43 1616,1616,1616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2935.76 1624,1624,1624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2929.37 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.14 1640,1640,1640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2988.24 1648,1648,1648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3033.5 1656,1656,1656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3087.64 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3126.51 
1672,1672,1672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2912.42 1680,1680,1680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.09 1688,1688,1688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2930.28 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3106.92 1704,1704,1704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3010.14 1712,1712,1712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3072.07 1720,1720,1720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.35 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3085.61 1736,1736,1736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2887.48 1744,1744,1744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2930.14 1752,1752,1752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2953.51 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2988.27 1768,1768,1768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3003.87 1776,1776,1776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3038.32 1784,1784,1784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3063.65 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3087.52 1800,1800,1800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2907.75 1808,1808,1808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2933.01 1816,1816,1816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2946.65 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2981.95 1832,1832,1832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3004.08 1840,1840,1840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3018.34 1848,1848,1848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.57 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3105.58 1864,1864,1864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2913.45 1872,1872,1872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2928.96 1880,1880,1880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2950.1 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2982.18 1896,1896,1896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3000.7 1904,1904,1904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.2 1912,1912,1912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3051.42 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3076.58 1928,1928,1928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2892.53 1936,1936,1936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2925.99 1944,1944,1944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2957.38 
1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3006.21 1960,1960,1960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2999.06 1968,1968,1968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3042.49 1976,1976,1976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.24 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.05 1992,1992,1992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2869.3 2000,2000,2000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2897.74 2008,2008,2008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2920.77 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2963.82 2024,2024,2024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3005.15 2032,2032,2032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3060.14 2040,2040,2040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.01 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3112.16 2056,2056,2056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2947.41 2064,2064,2064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2976.71 2072,2072,2072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2992.87 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2999.03 2088,2088,2088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2997.7 2096,2096,2096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3038.88 2104,2104,2104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.8 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.79 2120,2120,2120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2897.6 2128,2128,2128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2921.54 2136,2136,2136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2937.93 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2984.81 2152,2152,2152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3010.1 2160,2160,2160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3005.06 2168,2168,2168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3027.68 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3055.71 2184,2184,2184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2896.12 2192,2192,2192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2940.32 2200,2200,2200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2936.23 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2956.04 2216,2216,2216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2991.5 2224,2224,2224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3020.85 
2232,2232,2232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.31 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3071.14 2248,2248,2248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2962.22 2256,2256,2256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2983.75 2264,2264,2264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2959.5 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2969.98 2280,2280,2280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2983.73 2288,2288,2288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.4 2296,2296,2296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3036.04 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.38 2312,2312,2312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2946.9 2320,2320,2320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2975.45 2328,2328,2328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.22 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3032.88 2344,2344,2344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.95 2352,2352,2352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.88 2360,2360,2360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3101.14 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3115.06 2376,2376,2376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2959.39 2384,2384,2384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2962.33 2392,2392,2392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3018.99 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.09 2408,2408,2408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2987.67 2416,2416,2416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3010.32 2424,2424,2424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3023.62 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.14 2440,2440,2440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2933.93 2448,2448,2448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2931.89 2456,2456,2456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2960.72 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2978.18 2472,2472,2472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2998.78 2480,2480,2480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3033.84 2488,2488,2488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3034.93 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3045.05 2504,2504,2504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2941.6 
2512,2512,2512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2966.91 2520,2520,2520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2972.83 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2971.36 2536,2536,2536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3012.83 2544,2544,2544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.86 2552,2552,2552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.11 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.8 2568,2568,2568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2928.78 2576,2576,2576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2943.9 2584,2584,2584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2965.3 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2994.21 2600,2600,2600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3027.15 2608,2608,2608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3058.38 2616,2616,2616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.92 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3119.14 2632,2632,2632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2985.8 2640,2640,2640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3010.18 2648,2648,2648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.89 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.24 2664,2664,2664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044.24 2672,2672,2672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.2 2680,2680,2680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.23 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3082.19 2696,2696,2696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2932.35 2704,2704,2704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2951.16 2712,2712,2712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2969.06 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3000.14 2728,2728,2728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.6 2736,2736,2736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3024 2744,2744,2744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3041.11 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.68 2760,2760,2760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2942.13 2768,2768,2768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2965.17 2776,2776,2776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2970.41 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2992.29 
2792,2792,2792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.92 2800,2800,2800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3061.67 2808,2808,2808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3073.09 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.25 2824,2824,2824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2946.35 2832,2832,2832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3012.06 2840,2840,2840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3039.65 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.41 2856,2856,2856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.89 2864,2864,2864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3057.75 2872,2872,2872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.37 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3073.47 2888,2888,2888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2972.76 2896,2896,2896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2996.56 2904,2904,2904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3021.82 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.54 2920,2920,2920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3066.45 2928,2928,2928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3097.8 2936,2936,2936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3111.75 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3135.54 2952,2952,2952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.08 2960,2960,2960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2999.39 2968,2968,2968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3002.79 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.36 2984,2984,2984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.62 2992,2992,2992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3030.25 3000,3000,3000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.16 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3073.83 3016,3016,3016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2957.03 3024,3024,3024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2963.99 3032,3032,3032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.24 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2998.66 3048,3048,3048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.86 3056,3056,3056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044.01 3064,3064,3064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.5 
3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.8 3080,3080,3080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2961.62 3088,3088,3088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2998.01 3096,3096,3096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3014.55 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3024.98 3112,3112,3112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3026.02 3120,3120,3120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.3 3128,3128,3128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3107.83 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3126.29 3144,3144,3144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3018.52 3152,3152,3152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3037.98 3160,3160,3160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3007.71 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.98 3176,3176,3176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3037.9 3184,3184,3184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.53 3192,3192,3192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3086.99 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3110.42 3208,3208,3208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3012.36 3216,3216,3216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.23 3224,3224,3224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3057.35 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.32 3240,3240,3240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.76 3248,3248,3248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.93 3256,3256,3256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.49 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3075.63 3272,3272,3272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2954.93 3280,3280,3280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2971.32 3288,3288,3288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2981.97 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.61 3304,3304,3304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3023.63 3312,3312,3312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3030.12 3320,3320,3320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3045.45 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3063.03 3336,3336,3336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2962.74 3344,3344,3344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2993.22 
3352,3352,3352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3003.23 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.66 3368,3368,3368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3045.47 3376,3376,3376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3066.84 3384,3384,3384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.41 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.27 3400,3400,3400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2973.07 3408,3408,3408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3030.62 3416,3416,3416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.44 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.29 3432,3432,3432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.38 3440,3440,3440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3104.29 3448,3448,3448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3085.81 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3074.83 3464,3464,3464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2993.07 3472,3472,3472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.26 3480,3480,3480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.91 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.03 3496,3496,3496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3075.01 3504,3504,3504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.28 3512,3512,3512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3116.69 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3129.63 3528,3528,3528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.7 3536,3536,3536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.29 3544,3544,3544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.68 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3013.04 3560,3560,3560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.04 3568,3568,3568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3037.37 3576,3576,3576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3048.99 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3079.82 3592,3592,3592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2976.09 3600,3600,3600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2984.17 3608,3608,3608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2997.61 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.82 3624,3624,3624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3019.7 
3632,3632,3632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.86 3640,3640,3640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.83 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3073.55 3656,3656,3656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3006.26 3664,3664,3664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.49 3672,3672,3672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3006.57 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.97 3688,3688,3688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3055.74 3696,3696,3696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3091.85 3704,3704,3704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3108.88 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3127.22 3720,3720,3720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.27 3728,3728,3728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3059.54 3736,3736,3736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3072.82 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3039.79 3752,3752,3752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.31 3760,3760,3760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.06 3768,3768,3768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3091.48 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3111.07 3784,3784,3784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3024.84 3792,3792,3792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3037.94 3800,3800,3800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3055.41 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.92 3816,3816,3816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.4 3824,3824,3824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3082.65 3832,3832,3832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.37 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.36 3848,3848,3848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2979.55 3856,3856,3856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2992.68 3864,3864,3864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3004.07 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3030.47 3880,3880,3880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.53 3888,3888,3888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044 3896,3896,3896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3055.5 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.6 
3912,3912,3912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2976.54 3920,3920,3920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2994.24 3928,3928,3928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3026.77 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3029.69 3944,3944,3944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3066.62 3952,3952,3952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.7 3960,3960,3960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3051.66 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3088.92 3976,3976,3976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3034.25 3984,3984,3984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.06 3992,3992,3992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.72 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.78 4008,4008,4008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3099.06 4016,4016,4016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3114.77 4024,4024,4024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3125.98 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3115.52 4040,4040,4040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3011.54 4048,4048,4048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3025.65 4056,4056,4056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.86 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.87 4072,4072,4072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3078.12 4080,4080,4080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3088.44 4088,4088,4088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3100.8 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3115.89 4104,4104,4104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3035.91 4112,4112,4112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.89 4120,4120,4120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.55 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3021.12 4136,4136,4136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.46 4144,4144,4144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044.17 4152,4152,4152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3056.18 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.4 4168,4168,4168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2990.17 4176,4176,4176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3001.45 4184,4184,4184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3012.13 
4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3026.33 4200,4200,4200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3033.16 4208,4208,4208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3046.73 4216,4216,4216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3072.54 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.67 4232,4232,4232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3023.57 4240,4240,4240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.17 4248,4248,4248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3016.69 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.78 4264,4264,4264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3087.48 4272,4272,4272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3103.03 4280,4280,4280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3120.86 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3125.82 4296,4296,4296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.71 4304,4304,4304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.26 4312,4312,4312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.66 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3086.26 4328,4328,4328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.97 4336,4336,4336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.69 4344,4344,4344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.92 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3110.77 4360,4360,4360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3035.41 4368,4368,4368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3044.93 4376,4376,4376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.92 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.22 4392,4392,4392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3075.6 4400,4400,4400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3089.99 4408,4408,4408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.86 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3071.81 4424,4424,4424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2993.9 4432,4432,4432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3005.43 4440,4440,4440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3015.87 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3038.53 4456,4456,4456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3038.4 4464,4464,4464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.56 
4472,4472,4472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3060.08 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3071.79 4488,4488,4488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,2997.34 4496,4496,4496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3009.48 4504,4504,4504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3024.73 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.73 4520,4520,4520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3072.43 4528,4528,4528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3057.38 4536,4536,4536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.69 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.07 4552,4552,4552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3050.35 4560,4560,4560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.7 4568,4568,4568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3079.3 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3086.87 4584,4584,4584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3101.44 4592,4592,4592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3111.7 4600,4600,4600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3131.18 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3134.36 4616,4616,4616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.98 4624,4624,4624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.98 4632,4632,4632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.78 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.27 4648,4648,4648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.56 4656,4656,4656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3089.61 4664,4664,4664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3096.54 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.04 4680,4680,4680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3033.48 4688,4688,4688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3045.24 4696,4696,4696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3040.8 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3033.05 4712,4712,4712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3041.32 4720,4720,4720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.22 4728,4728,4728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3064.46 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3081.98 4744,4744,4744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3003.39 
4752,4752,4752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3012.96 4760,4760,4760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3021.17 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.56 4776,4776,4776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3039.42 4784,4784,4784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.45 4792,4792,4792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.17 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.23 4808,4808,4808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3024.04 4816,4816,4816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3041.6 4824,4824,4824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3075.92 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3078.75 4840,4840,4840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3097.57 4848,4848,4848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3105.72 4856,4856,4856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3122.8 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3131.22 4872,4872,4872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.13 4880,4880,4880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.63 4888,4888,4888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3092 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.83 4904,4904,4904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3095.37 4912,4912,4912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3095.11 4920,4920,4920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3105.42 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3112.38 4936,4936,4936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3047.5 4944,4944,4944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.8 4952,4952,4952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.31 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3070.65 4968,4968,4968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3078.28 4976,4976,4976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3089.94 4984,4984,4984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.99 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3079.39 5000,5000,5000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3008.53 5008,5008,5008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3015.86 5016,5016,5016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3028.41 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3047.01 
5032,5032,5032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3048.26 5040,5040,5040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3059.26 5048,5048,5048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.53 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3077.68 5064,5064,5064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3009.66 5072,5072,5072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3022.31 5080,5080,5080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3029.12 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3061.33 5096,5096,5096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3055.79 5104,5104,5104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3076.47 5112,5112,5112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3118.82 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3132.67 5128,5128,5128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3064.31 5136,5136,5136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3074.73 5144,5144,5144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3082.38 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3096.69 5160,5160,5160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3106.61 5168,5168,5168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3114.95 5176,5176,5176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3125.94 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3132.33 5192,5192,5192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.21 5200,5200,5200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3061.77 5208,5208,5208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3075.28 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3079 5224,5224,5224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3083.36 5232,5232,5232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3092.45 5240,5240,5240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3091.47 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3104.75 5256,5256,5256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3035.95 5264,5264,5264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3051.23 5272,5272,5272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3032.08 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3042.61 5288,5288,5288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3047.5 5296,5296,5296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3062.58 5304,5304,5304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3069.66 
5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3084.7 5320,5320,5320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3013.49 5328,5328,5328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3025.8 5336,5336,5336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3031.27 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3043.2 5352,5352,5352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3047.02 5360,5360,5360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3060.65 5368,5368,5368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3067.25 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3092.19 5384,5384,5384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3016.48 5392,5392,5392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.86 5400,5400,5400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3097.53 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3101.34 5416,5416,5416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3109.31 5424,5424,5424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3112.5 5432,5432,5432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3128.59 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3132.78 5448,5448,5448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3076.27 5456,5456,5456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.56 5464,5464,5464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3093.79 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3098 5480,5480,5480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3110.11 5488,5488,5488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3111.93 5496,5496,5496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3117.28 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3117.31 5512,5512,5512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3053.96 5520,5520,5520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3058.02 5528,5528,5528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3059.28 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.24 5544,5544,5544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3074.2 5552,5552,5552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3085.74 5560,5560,5560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3068.8 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3079.75 5576,5576,5576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.01 5584,5584,5584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3027.46 
5592,5592,5592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3039.72 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3049.43 5608,5608,5608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3054.34 5616,5616,5616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3065.62 5624,5624,5624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3074.26 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3078.97 5640,5640,5640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3017.38 5648,5648,5648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3027.33 5656,5656,5656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3036.39 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3058.87 5672,5672,5672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3052.41 5680,5680,5680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3104.76 5688,5688,5688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3133.14 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3141.78 5704,5704,5704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3080.45 5712,5712,5712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3082.14 5720,5720,5720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3091.84 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3096.9 5736,5736,5736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3111.52 5744,5744,5744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3116.88 5752,5752,5752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3129.8 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,cgemm,gpu,cublas,1,K40,3134.5
clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas_dgemm_8.csv
m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS
8,8,8,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,0.146077 16,16,16,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1.15056 24,24,24,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1.43701 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,3.10744 40,40,40,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,5.59196 48,48,48,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,9.03529 56,56,56,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,13.4417 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,18.8797 72,72,72,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,23.9032 80,80,80,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,31.1625 88,88,88,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,39.8056
96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,49.6346 104,104,104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,60.3144 112,112,112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,72.5311 120,120,120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,85.7994 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,100.366 136,136,136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,116.161 144,144,144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,133.333 152,152,152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,151.961 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,171.704 168,168,168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,192.867 176,176,176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,215.528 184,184,184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,239.412 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,264.544 200,200,200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,290.645 208,208,208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,318.602 216,216,216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,347.507 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,376.783 232,232,232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,408.678 240,240,240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,440.185 248,248,248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,453.486 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,475.477 264,264,264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,488.77 272,272,272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,519.857 280,280,280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,554.343 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,590.48 296,296,296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,628.483 304,304,304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,662.449 312,312,312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,704.018 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,744.727 328,328,328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,534.093 336,336,336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,565.996 344,344,344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,592.8 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,621.861 360,360,360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,653.08 368,368,368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,681.938 376,376,376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,712.327 
384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,746.563 392,392,392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,756.5 400,400,400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,794.192 408,408,408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,831.098 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,865.852 424,424,424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,899.676 432,432,432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,939.098 440,440,440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,976.545 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1013.3 456,456,456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,715.911 464,464,464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,741.656 472,472,472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,764.924 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,791.639 488,488,488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,778.03 496,496,496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,830.688 504,504,504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,859.886 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,726.031 520,520,520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,722.419 528,528,528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,748.166 536,536,536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,770.763 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,796.503 552,552,552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,818.694 560,560,560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,843.801 568,568,568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,866.473 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,892.317 584,584,584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,839.664 592,592,592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,859.375 600,600,600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,891.567 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,917.408 616,616,616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,949.237 624,624,624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,973.694 632,632,632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,973.003 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1128.18 648,648,648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,903.904 656,656,656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,931.3 664,664,664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,950.21 
672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,975.315 680,680,680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,989.838 688,688,688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1018.63 696,696,696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1037.3 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1066.69 712,712,712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,947.708 720,720,720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,974.055 728,728,728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,991.21 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1019.47 744,744,744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1035.9 752,752,752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1057.54 760,760,760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1080.92 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1117.47 776,776,776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1050.06 784,784,784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1068.53 792,792,792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1090.61 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1119.57 808,808,808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1129.64 816,816,816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1166.51 824,824,824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.65 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1212.92 840,840,840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1043.34 848,848,848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1061.45 856,856,856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1080.13 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1099.58 872,872,872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1114.88 880,880,880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1138.21 888,888,888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1158.61 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1158.85 904,904,904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1102.16 912,912,912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1131.36 920,920,920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1146.79 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.12 936,936,936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1188.84 944,944,944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.54 952,952,952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.61 
960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.68 968,968,968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1096.19 976,976,976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1115.23 984,984,984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1129.32 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1149.98 1000,1000,1000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1157.96 1008,1008,1008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1180.88 1016,1016,1016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.46 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.64 1032,1032,1032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1109.21 1040,1040,1040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1126.81 1048,1048,1048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1146.85 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1162.16 1064,1064,1064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1171.37 1072,1072,1072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1185.26 1080,1080,1080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1188.76 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.6 1096,1096,1096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1112.58 1104,1104,1104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1133.5 1112,1112,1112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1149.62 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1158.56 1128,1128,1128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1173.36 1136,1136,1136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1198.47 1144,1144,1144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1204.26 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1258.76 1160,1160,1160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1109.8 1168,1168,1168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1128.5 1176,1176,1176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1139.02 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1160.26 1192,1192,1192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1165.43 1200,1200,1200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1181.8 1208,1208,1208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1194.85 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.3 1224,1224,1224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1119.86 1232,1232,1232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1137.05 
1240,1240,1240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1154.47 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1173.09 1256,1256,1256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1173.07 1264,1264,1264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1183.37 1272,1272,1272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1197.05 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1268.48 1288,1288,1288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1133.01 1296,1296,1296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1158.56 1304,1304,1304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1172.49 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1190.59 1320,1320,1320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1172.29 1328,1328,1328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.22 1336,1336,1336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1199.65 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.03 1352,1352,1352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1125.95 1360,1360,1360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1151.08 1368,1368,1368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1153.78 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.73 1384,1384,1384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1169.92 1392,1392,1392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1187.82 1400,1400,1400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1210.92 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.59 1416,1416,1416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1129.01 1424,1424,1424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1137.09 1432,1432,1432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1152.07 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1200.47 1448,1448,1448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1181.63 1456,1456,1456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1195.3 1464,1464,1464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1201.01 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.14 1480,1480,1480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1144.93 1488,1488,1488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1161.41 1496,1496,1496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1171.25 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1184.5 1512,1512,1512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.73 
1520,1520,1520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1208.03 1528,1528,1528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1216.35 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.37 1544,1544,1544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1149.11 1552,1552,1552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1157.01 1560,1560,1560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1168.76 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1183.55 1576,1576,1576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.16 1584,1584,1584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.33 1592,1592,1592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1216.08 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.32 1608,1608,1608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1129.22 1616,1616,1616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1144.31 1624,1624,1624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1153.1 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1166.41 1640,1640,1640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1174.11 1648,1648,1648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1184.9 1656,1656,1656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1197.73 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1268.47 1672,1672,1672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1133.96 1680,1680,1680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1147.77 1688,1688,1688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1156.19 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1168.86 1704,1704,1704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1173.4 1712,1712,1712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.46 1720,1720,1720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1198.24 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.16 1736,1736,1736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1151.63 1744,1744,1744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1164.66 1752,1752,1752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1175.48 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1186.06 1768,1768,1768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1196.96 1776,1776,1776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1206.62 1784,1784,1784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.29 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.35 
1800,1800,1800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1159.84 1808,1808,1808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1169.68 1816,1816,1816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1181.2 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1192.29 1832,1832,1832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1199.37 1840,1840,1840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.67 1848,1848,1848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.79 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.57 1864,1864,1864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1171.86 1872,1872,1872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1181.52 1880,1880,1880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1184.91 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1200.06 1896,1896,1896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1207.09 1904,1904,1904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.17 1912,1912,1912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.09 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.61 1928,1928,1928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1167.21 1936,1936,1936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1178.05 1944,1944,1944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1185.15 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1195.16 1960,1960,1960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.62 1968,1968,1968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.66 1976,1976,1976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.86 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.53 1992,1992,1992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1169.11 2000,2000,2000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1182.36 2008,2008,2008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1187.52 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1197.34 2024,2024,2024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1201.45 2032,2032,2032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.33 2040,2040,2040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1220.28 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.17 2056,2056,2056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1162.95 2064,2064,2064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1170.49 2072,2072,2072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1179.36 
2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1187.34 2088,2088,2088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1198.08 2096,2096,2096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1206.78 2104,2104,2104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.93 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225 2120,2120,2120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1164.54 2128,2128,2128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1173.23 2136,2136,2136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1182.46 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1192.44 2152,2152,2152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1195.6 2160,2160,2160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.17 2168,2168,2168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.45 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.26 2184,2184,2184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1175.63 2192,2192,2192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1184.15 2200,2200,2200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.38 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1202.2 2216,2216,2216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.61 2224,2224,2224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.19 2232,2232,2232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1220.87 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.91 2248,2248,2248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1175.36 2256,2256,2256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1184.95 2264,2264,2264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1192.17 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1202.56 2280,2280,2280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.33 2288,2288,2288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.86 2296,2296,2296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.43 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.93 2312,2312,2312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1183.4 2320,2320,2320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.68 2328,2328,2328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1194.64 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1206.78 2344,2344,2344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1210.78 2352,2352,2352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.12 
2360,2360,2360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.93 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.2 2376,2376,2376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1179.93 2384,2384,2384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1185.91 2392,2392,2392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.03 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.18 2408,2408,2408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.62 2416,2416,2416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.87 2424,2424,2424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.12 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.7 2440,2440,2440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1180.82 2448,2448,2448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1192.26 2456,2456,2456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1196.66 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1204.7 2472,2472,2472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1210.54 2480,2480,2480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1220.66 2488,2488,2488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.95 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.33 2504,2504,2504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1185.05 2512,2512,2512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1193.21 2520,2520,2520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1199.49 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1211.42 2536,2536,2536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1214.57 2544,2544,2544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.73 2552,2552,2552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.46 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.55 2568,2568,2568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1189.16 2576,2576,2576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1197.06 2584,2584,2584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.75 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.76 2600,2600,2600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.24 2608,2608,2608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.64 2616,2616,2616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.04 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.53 2632,2632,2632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1188.76 
2640,2640,2640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1196.81 2648,2648,2648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.95 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1211.11 2664,2664,2664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.54 2672,2672,2672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.32 2680,2680,2680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.2 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.32 2696,2696,2696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1190.99 2704,2704,2704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1198 2712,2712,2712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.94 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1214.72 2728,2728,2728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.44 2736,2736,2736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.97 2744,2744,2744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.56 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.02 2760,2760,2760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1191.93 2768,2768,2768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1199.52 2776,2776,2776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1206.11 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.36 2792,2792,2792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.24 2800,2800,2800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.46 2808,2808,2808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.42 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.62 2824,2824,2824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1195.95 2832,2832,2832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.35 2840,2840,2840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.57 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.01 2856,2856,2856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.45 2864,2864,2864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.84 2872,2872,2872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.53 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.92 2888,2888,2888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1196.32 2896,2896,2896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.47 2904,2904,2904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.69 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1216.61 
2920,2920,2920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.61 2928,2928,2928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.84 2936,2936,2936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.95 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.98 2952,2952,2952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1198.8 2960,2960,2960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.35 2968,2968,2968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1211.74 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.11 2984,2984,2984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.64 2992,2992,2992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.21 3000,3000,3000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.39 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.92 3016,3016,3016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1201.46 3024,3024,3024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1207.42 3032,3032,3032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.66 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1220.64 3048,3048,3048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.9 3056,3056,3056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.79 3064,3064,3064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.89 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.54 3080,3080,3080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1203.06 3088,3088,3088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.75 3096,3096,3096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1214.52 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.17 3112,3112,3112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.39 3120,3120,3120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.13 3128,3128,3128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.85 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.26 3144,3144,3144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1202.42 3152,3152,3152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.57 3160,3160,3160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1215.11 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1221.87 3176,3176,3176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.19 3184,3184,3184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.09 3192,3192,3192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.48 
3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.92 3208,3208,3208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1205.3 3216,3216,3216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1211.6 3224,3224,3224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.57 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.37 3240,3240,3240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.69 3248,3248,3248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.25 3256,3256,3256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.47 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.69 3272,3272,3272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1206.75 3280,3280,3280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.41 3288,3288,3288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.87 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.21 3304,3304,3304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.58 3312,3312,3312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.96 3320,3320,3320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.64 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.71 3336,3336,3336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1207.48 3344,3344,3344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.37 3352,3352,3352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.76 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.13 3368,3368,3368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.28 3376,3376,3376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.6 3384,3384,3384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.4 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.44 3400,3400,3400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1209.78 3408,3408,3408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1216.19 3416,3416,3416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1221.89 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.01 3432,3432,3432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.89 3440,3440,3440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.23 3448,3448,3448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.9 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.45 3464,3464,3464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1212.63 3472,3472,3472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.13 
3480,3480,3480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.17 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.73 3496,3496,3496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.38 3504,3504,3504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.43 3512,3512,3512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.61 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.13 3528,3528,3528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1212.37 3536,3536,3536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.62 3544,3544,3544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.87 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.42 3560,3560,3560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.88 3568,3568,3568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.89 3576,3576,3576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.84 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.94 3592,3592,3592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1213.84 3600,3600,3600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.81 3608,3608,3608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.66 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.74 3624,3624,3624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.51 3632,3632,3632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.35 3640,3640,3640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.68 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.37 3656,3656,3656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1214.42 3664,3664,3664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1219.38 3672,3672,3672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.29 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.41 3688,3688,3688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.92 3696,3696,3696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.83 3704,3704,3704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.73 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.21 3720,3720,3720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1216.01 3728,3728,3728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.1 3736,3736,3736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.08 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.51 3752,3752,3752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.87 
3760,3760,3760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.56 3768,3768,3768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.75 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.67 3784,3784,3784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.07 3792,3792,3792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.51 3800,3800,3800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.37 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.46 3816,3816,3816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.79 3824,3824,3824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.39 3832,3832,3832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.37 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.36 3848,3848,3848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.48 3856,3856,3856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.11 3864,3864,3864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.14 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.78 3880,3880,3880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.03 3888,3888,3888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.41 3896,3896,3896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.92 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.3 3912,3912,3912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1217.96 3920,3920,3920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1222.96 3928,3928,3928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.79 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.89 3944,3944,3944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.2 3952,3952,3952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.48 3960,3960,3960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.78 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.17 3976,3976,3976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1218.84 3984,3984,3984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1224.22 3992,3992,3992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.71 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.11 4008,4008,4008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.54 4016,4016,4016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.85 4024,4024,4024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.41 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.74 
4040,4040,4040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1220.52 4048,4048,4048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.55 4056,4056,4056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.92 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.04 4072,4072,4072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.48 4080,4080,4080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.61 4088,4088,4088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1249.42 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.82 4104,4104,4104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.12 4112,4112,4112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.96 4120,4120,4120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.11 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.07 4136,4136,4136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.83 4144,4144,4144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.17 4152,4152,4152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.4 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.52 4168,4168,4168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.06 4176,4176,4176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.07 4184,4184,4184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.28 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.54 4200,4200,4200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.42 4208,4208,4208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.85 4216,4216,4216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.7 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1259.4 4232,4232,4232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.63 4240,4240,4240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.68 4248,4248,4248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.16 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.36 4264,4264,4264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.1 4272,4272,4272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.84 4280,4280,4280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.62 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.02 4296,4296,4296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.34 4304,4304,4304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.92 4312,4312,4312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.36 
4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.19 4328,4328,4328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.34 4336,4336,4336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.05 4344,4344,4344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.36 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.35 4360,4360,4360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1223.6 4368,4368,4368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1229.48 4376,4376,4376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.88 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.66 4392,4392,4392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.47 4400,4400,4400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.45 4408,4408,4408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.92 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.91 4424,4424,4424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1226.49 4432,4432,4432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.49 4440,4440,4440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.24 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.01 4456,4456,4456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.99 4464,4464,4464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.88 4472,4472,4472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.17 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1259.64 4488,4488,4488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.49 4496,4496,4496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.03 4504,4504,4504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.85 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.89 4520,4520,4520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.62 4528,4528,4528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.29 4536,4536,4536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.45 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.37 4552,4552,4552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1225.91 4560,4560,4560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.62 4568,4568,4568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.55 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.01 4584,4584,4584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.04 4592,4592,4592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.47 
4600,4600,4600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.46 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1258.83 4616,4616,4616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.9 4624,4624,4624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.56 4632,4632,4632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.41 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.66 4648,4648,4648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.74 4656,4656,4656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.41 4664,4664,4664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.7 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1258.65 4680,4680,4680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1227.09 4688,4688,4688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.52 4696,4696,4696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.25 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.31 4712,4712,4712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.53 4720,4720,4720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.11 4728,4728,4728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.28 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.43 4744,4744,4744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.61 4752,4752,4752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.18 4760,4760,4760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.85 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.27 4776,4776,4776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.54 4784,4784,4784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.81 4792,4792,4792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.62 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.3 4808,4808,4808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1228.37 4816,4816,4816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1232.71 4824,4824,4824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.84 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.13 4840,4840,4840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.75 4848,4848,4848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1249.02 4856,4856,4856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.58 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.61 4872,4872,4872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.44 
4880,4880,4880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.34 4888,4888,4888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.21 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.56 4904,4904,4904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.47 4912,4912,4912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252 4920,4920,4920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.88 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.35 4936,4936,4936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1230.37 4944,4944,4944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.98 4952,4952,4952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.65 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.93 4968,4968,4968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.59 4976,4976,4976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.75 4984,4984,4984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.34 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.49 5000,5000,5000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.34 5008,5008,5008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.87 5016,5016,5016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.28 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.34 5032,5032,5032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.86 5040,5040,5040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.37 5048,5048,5048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.05 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1259.13 5064,5064,5064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.84 5072,5072,5072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.2 5080,5080,5080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.93 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.01 5096,5096,5096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.62 5104,5104,5104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.9 5112,5112,5112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.45 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.47 5128,5128,5128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.39 5136,5136,5136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.49 5144,5144,5144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.96 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.18 
5160,5160,5160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.49 5168,5168,5168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.69 5176,5176,5176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.38 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.14 5192,5192,5192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1231.95 5200,5200,5200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.1 5208,5208,5208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.58 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1244.07 5224,5224,5224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.09 5232,5232,5232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.33 5240,5240,5240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1255.03 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.04 5256,5256,5256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1233.41 5264,5264,5264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.62 5272,5272,5272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.98 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.06 5288,5288,5288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1248.62 5296,5296,5296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.79 5304,5304,5304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.25 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1260.27 5320,5320,5320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.18 5328,5328,5328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.77 5336,5336,5336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1241.95 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1245.91 5352,5352,5352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1249.63 5360,5360,5360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.9 5368,5368,5368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.97 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1264.13 5384,5384,5384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1235.31 5392,5392,5392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.46 5400,5400,5400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.89 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.6 5416,5416,5416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1249.53 5424,5424,5424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.01 5432,5432,5432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.18 
5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1261.75 5448,5448,5448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1234.99 5456,5456,5456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1238.79 5464,5464,5464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1242.38 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1246.19 5480,5480,5480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1249.09 5488,5488,5488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1252.9 5496,5496,5496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1256.2 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1261.46 5512,5512,5512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.47 5520,5520,5520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.34 5528,5528,5528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.5 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.47 5544,5544,5544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.82 5552,5552,5552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.28 5560,5560,5560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.32 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1261.48 5576,5576,5576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.13 5584,5584,5584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1239.98 5592,5592,5592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.29 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.03 5608,5608,5608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.44 5616,5616,5616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1253.94 5624,5624,5624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.28 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1262.77 5640,5640,5640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1237.05 5648,5648,5648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.69 5656,5656,5656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.83 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.71 5672,5672,5672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1251.03 5680,5680,5680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.68 5688,5688,5688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.96 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1261.79 5704,5704,5704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1236.77 5712,5712,5712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1240.83 
5720,5720,5720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1243.93 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1247.43 5736,5736,5736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1250.76 5744,5744,5744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1254.49 5752,5752,5752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1257.76 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,1,K40,1263.24 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas_sgemm_8.csv000066400000000000000000002137031264277366700245310ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 8,8,8,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,0.104703 16,16,16,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,0.821665 24,24,24,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1.67361 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3.91727 40,40,40,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,6.80128 48,48,48,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,11.7464 56,56,56,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,16.6461 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,24.9542 72,72,72,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,33.8547 80,80,80,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,46.0846 88,88,88,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,58.6717 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,76.6005 104,104,104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,90.1333 112,112,112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,113.073 120,120,120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,133.128 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,162.193 136,136,136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,184.418 144,144,144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,223.168 152,152,152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,245.409 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,286.835 168,168,168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,322.78 176,176,176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,372.516 184,184,184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,407.157 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,456.491 200,200,200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,405.577 208,208,208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,455.872 216,216,216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,487.906 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,545.603 
232,232,232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,603.245 240,240,240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,667.826 248,248,248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,663.318 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,731.831 264,264,264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,749.175 272,272,272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,824.402 280,280,280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,852.67 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,933.667 296,296,296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,973.146 304,304,304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1061.17 312,312,312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1099.02 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1180.4 328,328,328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,920.385 336,336,336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,994.444 344,344,344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1005.37 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1061.69 360,360,360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1149.02 368,368,368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1178.44 376,376,376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1199.67 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1271.57 392,392,392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1026.7 400,400,400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1100.22 408,408,408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1110.12 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1181.35 424,424,424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1201.34 432,432,432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1273.34 440,440,440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1296.17 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1374.43 456,456,456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1177.73 464,464,464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1247.31 472,472,472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1289.44 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1368.46 488,488,488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1369.97 496,496,496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1443.64 504,504,504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1465.64 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1386.62 
520,520,520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1365.86 528,528,528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1431.96 536,536,536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1482.96 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1547.97 552,552,552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1581.24 560,560,560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1658.48 568,568,568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1689.72 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1769.72 584,584,584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1517.48 592,592,592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1588.69 600,600,600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1605.47 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1639.66 616,616,616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1667.52 624,624,624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1721.91 632,632,632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1760.36 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2171.68 648,648,648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1657.11 656,656,656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1717.63 664,664,664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1750.82 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1812.49 680,680,680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1840.94 688,688,688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1894.81 696,696,696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1926.87 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1990.32 712,712,712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1791.46 720,720,720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1857.37 728,728,728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1888.86 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1952.92 744,744,744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1978.53 752,752,752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2040.84 760,760,760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2077.94 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2068.85 776,776,776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1840.95 784,784,784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1887.58 792,792,792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1907.44 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1967.15 
808,808,808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2003.7 816,816,816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2056.31 824,824,824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2061.41 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2068.31 840,840,840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1934.03 848,848,848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,1972.57 856,856,856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2021.63 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2097.88 872,872,872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2111.61 880,880,880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2158.95 888,888,888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2173.91 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2394.67 904,904,904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2206.65 912,912,912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2246.59 920,920,920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2281.97 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2333.1 936,936,936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2363.43 944,944,944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2427.73 952,952,952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2441.22 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2717.04 968,968,968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2273.85 976,976,976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2320.57 984,984,984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2350.18 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2415.78 1000,1000,1000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2397.94 1008,1008,1008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2426.77 1016,1016,1016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2438.69 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2273.43 1032,1032,1032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2098.63 1040,1040,1040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2137.89 1048,1048,1048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2143.01 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2218.38 1064,1064,1064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2214.71 1072,1072,1072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2266.9 1080,1080,1080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2308.48 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2344.05 
1096,1096,1096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2282.81 1104,1104,1104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2326.05 1112,1112,1112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2339.71 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2402.63 1128,1128,1128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2443.66 1136,1136,1136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2484.06 1144,1144,1144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2502.6 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2691.61 1160,1160,1160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2420.8 1168,1168,1168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2463.91 1176,1176,1176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2485.45 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2540.62 1192,1192,1192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2512.77 1200,1200,1200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2547.92 1208,1208,1208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2542.77 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2619.32 1224,1224,1224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2435.93 1232,1232,1232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2485.57 1240,1240,1240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2529.47 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2567.04 1256,1256,1256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2583.41 1264,1264,1264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2644.8 1272,1272,1272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2638.76 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3001.66 1288,1288,1288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2594.52 1296,1296,1296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2634.01 1304,1304,1304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2657.25 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2717.76 1320,1320,1320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2730.76 1328,1328,1328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2783.09 1336,1336,1336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2814.94 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2607.29 1352,1352,1352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2580.68 1360,1360,1360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2654.62 1368,1368,1368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2659.48 
1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2730.85 1384,1384,1384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2731.84 1392,1392,1392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2771.63 1400,1400,1400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2810.81 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2819.08 1416,1416,1416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2624.71 1424,1424,1424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2648.2 1432,1432,1432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2662.29 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2725.92 1448,1448,1448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2716.62 1456,1456,1456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2778.05 1464,1464,1464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2793.76 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2832.67 1480,1480,1480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2645.13 1488,1488,1488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2651.36 1496,1496,1496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2668.75 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2722.44 1512,1512,1512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2696.12 1520,1520,1520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2765.58 1528,1528,1528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2783 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2700.43 1544,1544,1544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2647.29 1552,1552,1552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2681.72 1560,1560,1560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2682.36 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2762.72 1576,1576,1576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2760.25 1584,1584,1584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2801.98 1592,1592,1592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2807.66 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2886.43 1608,1608,1608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2715.46 1616,1616,1616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2744.85 1624,1624,1624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2787.74 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2846.73 1640,1640,1640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2856.76 1648,1648,1648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2898.4 
1656,1656,1656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2924.24 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2974.9 1672,1672,1672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2769.87 1680,1680,1680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2808.24 1688,1688,1688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2835.25 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2870.25 1704,1704,1704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2886.39 1712,1712,1712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2888.15 1720,1720,1720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2922.27 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3014.4 1736,1736,1736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2809.68 1744,1744,1744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2851.03 1752,1752,1752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2866.26 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2913.5 1768,1768,1768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2889.26 1776,1776,1776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2925.25 1784,1784,1784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2950.55 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2912.6 1800,1800,1800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2836.37 1808,1808,1808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2886.91 1816,1816,1816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2902.92 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2930.15 1832,1832,1832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2946.57 1840,1840,1840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2942.49 1848,1848,1848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2949.67 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2977.64 1864,1864,1864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2809.2 1872,1872,1872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2840.4 1880,1880,1880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2831.54 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2898.19 1896,1896,1896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2902.89 1904,1904,1904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2913.74 1912,1912,1912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2915.76 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3105.77 1928,1928,1928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2861.64 
1936,1936,1936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2894.3 1944,1944,1944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2890.78 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2940.56 1960,1960,1960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2942.54 1968,1968,1968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2969.41 1976,1976,1976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2983.18 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3024.03 1992,1992,1992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2828.5 2000,2000,2000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2861.69 2008,2008,2008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2868.39 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2909.4 2024,2024,2024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2919.09 2032,2032,2032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2950.49 2040,2040,2040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2957.77 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3041.55 2056,2056,2056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2832.54 2064,2064,2064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2855.08 2072,2072,2072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2869.87 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2918.86 2088,2088,2088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2916.55 2096,2096,2096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2944.66 2104,2104,2104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2969.08 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3109.67 2120,2120,2120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2922.92 2128,2128,2128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2954.1 2136,2136,2136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2965.68 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3005.65 2152,2152,2152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3006.96 2160,2160,2160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3040.58 2168,2168,2168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3057.55 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3089.4 2184,2184,2184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2907.34 2192,2192,2192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2933.35 2200,2200,2200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2946.13 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2986.82 
2216,2216,2216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2989.94 2224,2224,2224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3017.09 2232,2232,2232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3030.64 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3071.62 2248,2248,2248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2903.48 2256,2256,2256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2928.26 2264,2264,2264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2944.53 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2985.01 2280,2280,2280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2984.88 2288,2288,2288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2986.05 2296,2296,2296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2999.75 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2993.91 2312,2312,2312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2903.16 2320,2320,2320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2907.19 2328,2328,2328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2938.61 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2983.75 2344,2344,2344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2975.75 2352,2352,2352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3005.2 2360,2360,2360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3019.96 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3052.77 2376,2376,2376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2942.61 2384,2384,2384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2966.39 2392,2392,2392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2963.28 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3009.87 2408,2408,2408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3015.92 2416,2416,2416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3035 2424,2424,2424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3050.42 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3097.29 2440,2440,2440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2869.28 2448,2448,2448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2906.64 2456,2456,2456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2890.19 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2931.03 2472,2472,2472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2936.24 2480,2480,2480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2978.41 2488,2488,2488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2976.06 
2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3139.45 2504,2504,2504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2948.79 2512,2512,2512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2960.15 2520,2520,2520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2990.6 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3022.1 2536,2536,2536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3008.37 2544,2544,2544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3035.22 2552,2552,2552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3048.54 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3157.9 2568,2568,2568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2978.54 2576,2576,2576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3001.93 2584,2584,2584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3018.03 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3050.05 2600,2600,2600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3054.56 2608,2608,2608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3096.06 2616,2616,2616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090.23 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3148.4 2632,2632,2632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2963.72 2640,2640,2640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2988.68 2648,2648,2648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2997.99 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3052.01 2664,2664,2664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3038.87 2672,2672,2672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.53 2680,2680,2680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3074.35 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3125.2 2696,2696,2696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2970.68 2704,2704,2704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2978.12 2712,2712,2712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3006.34 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3029.75 2728,2728,2728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3026.73 2736,2736,2736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3047.27 2744,2744,2744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3066.31 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3122.69 2760,2760,2760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2966.98 2768,2768,2768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2991.74 
2776,2776,2776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3001.79 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3044.76 2792,2792,2792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3029.25 2800,2800,2800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3061.99 2808,2808,2808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3061.15 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3156.25 2824,2824,2824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2978 2832,2832,2832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2996.89 2840,2840,2840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3006.45 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3057.19 2856,2856,2856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3056.15 2864,2864,2864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3080.72 2872,2872,2872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3067.81 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3236.05 2888,2888,2888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3034.63 2896,2896,2896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3056.1 2904,2904,2904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.76 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3099.68 2920,2920,2920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3114.08 2928,2928,2928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3124.14 2936,2936,2936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3133.3 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3181.67 2952,2952,2952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3026.52 2960,2960,2960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3049.78 2968,2968,2968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3059.63 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3092.66 2984,2984,2984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090.98 2992,2992,2992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3114.59 3000,3000,3000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3123.17 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3157.27 3016,3016,3016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2997.05 3024,3024,3024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3018.95 3032,3032,3032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3027.06 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3061.87 3048,3048,3048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3059.17 
3056,3056,3056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3084.61 3064,3064,3064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3092.47 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3237.52 3080,3080,3080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3063.5 3088,3088,3088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3086.15 3096,3096,3096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3093.39 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3126.55 3112,3112,3112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3127.03 3120,3120,3120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3147.38 3128,3128,3128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3157.7 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3189.11 3144,3144,3144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3058.74 3152,3152,3152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3078.73 3160,3160,3160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3089.58 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3113.52 3176,3176,3176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3117.21 3184,3184,3184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3144.36 3192,3192,3192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3149.81 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3183.05 3208,3208,3208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3038.56 3216,3216,3216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3059.08 3224,3224,3224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3068.48 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3099.9 3240,3240,3240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3097.39 3248,3248,3248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3119.16 3256,3256,3256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3128.46 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3186.52 3272,3272,3272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3048.96 3280,3280,3280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3072.86 3288,3288,3288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3077.6 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3111.1 3304,3304,3304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3113.05 3312,3312,3312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3129.37 3320,3320,3320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3142.24 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3225.42 
3336,3336,3336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3076.42 3344,3344,3344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3098 3352,3352,3352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3107.94 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3136.25 3368,3368,3368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3137.99 3376,3376,3376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3156.42 3384,3384,3384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3163.77 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3208.21 3400,3400,3400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3069.82 3408,3408,3408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3095.53 3416,3416,3416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3099.22 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3127.16 3432,3432,3432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3130.14 3440,3440,3440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3147.99 3448,3448,3448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3156.42 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3225.67 3464,3464,3464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3091.68 3472,3472,3472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3110.44 3480,3480,3480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3120.09 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3148.24 3496,3496,3496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3147.03 3504,3504,3504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3167.48 3512,3512,3512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3174.21 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3205.81 3528,3528,3528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3067.43 3536,3536,3536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090.04 3544,3544,3544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3098.44 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3124.08 3560,3560,3560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3123.75 3568,3568,3568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3145.89 3576,3576,3576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3150.1 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3238.13 3592,3592,3592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3097.16 3600,3600,3600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3118.6 3608,3608,3608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3126.8 
3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3155.9 3624,3624,3624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3152.26 3632,3632,3632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3173.47 3640,3640,3640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3162.4 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3250.15 3656,3656,3656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3119.64 3664,3664,3664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3138.28 3672,3672,3672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3144.86 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3175.17 3688,3688,3688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3173.87 3696,3696,3696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3191.15 3704,3704,3704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3201.78 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3229.93 3720,3720,3720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3097.95 3728,3728,3728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3114.4 3736,3736,3736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3125.12 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3153.31 3752,3752,3752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3150.07 3760,3760,3760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3167.8 3768,3768,3768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3179.58 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3203.56 3784,3784,3784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3070.02 3792,3792,3792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3101.47 3800,3800,3800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3093.68 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3133.45 3816,3816,3816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090 3824,3824,3824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3150.71 3832,3832,3832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3132.72 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3293.83 3848,3848,3848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3161.82 3856,3856,3856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3176.58 3864,3864,3864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3185.09 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3214 3880,3880,3880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3213.08 3888,3888,3888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3230.01 
3896,3896,3896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3238.24 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3264.19 3912,3912,3912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3136.15 3920,3920,3920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3153.77 3928,3928,3928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3162.94 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3187.41 3944,3944,3944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3186.03 3952,3952,3952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3204.92 3960,3960,3960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3210.44 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3241.4 3976,3976,3976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3099.04 3984,3984,3984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3129.69 3992,3992,3992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3127.69 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3162.19 4008,4008,4008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3158.42 4016,4016,4016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3183.32 4024,4024,4024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3189.33 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3248.32 4040,4040,4040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3134.9 4048,4048,4048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3151.77 4056,4056,4056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3159.7 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3186.17 4072,4072,4072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3185.79 4080,4080,4080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3202.37 4088,4088,4088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3205.7 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3277.98 4104,4104,4104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3158.32 4112,4112,4112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3174.66 4120,4120,4120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3186.96 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3208.72 4136,4136,4136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3212.69 4144,4144,4144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3224 4152,4152,4152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3231.38 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3261.81 4168,4168,4168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3134.31 
4176,4176,4176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3119.38 4184,4184,4184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3129.63 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3183.98 4200,4200,4200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3116.7 4208,4208,4208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3155.29 4216,4216,4216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3130.73 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3257.71 4232,4232,4232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3116.99 4240,4240,4240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3140.69 4248,4248,4248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3111.53 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3192.45 4264,4264,4264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3120.91 4272,4272,4272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3172.14 4280,4280,4280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3131.96 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3242.3 4296,4296,4296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3034.6 4304,4304,4304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3067.17 4312,4312,4312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3039.71 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3169.22 4328,4328,4328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3060.78 4336,4336,4336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3103.52 4344,4344,4344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3072.31 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3269.24 4360,4360,4360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3029.61 4368,4368,4368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3055.85 4376,4376,4376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3051.82 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3152.41 4392,4392,4392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.54 4400,4400,4400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3096.4 4408,4408,4408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3074.51 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3269.81 4424,4424,4424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3063.72 4432,4432,4432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3096.69 4440,4440,4440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3113.47 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3216.37 
4456,4456,4456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3110.36 4464,4464,4464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3148.21 4472,4472,4472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3149.53 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3242.31 4488,4488,4488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3055.24 4496,4496,4496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3098.45 4504,4504,4504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3068.3 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3162.01 4520,4520,4520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.83 4528,4528,4528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3108.38 4536,4536,4536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3074.32 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3172.82 4552,4552,4552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3009.86 4560,4560,4560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3027.29 4568,4568,4568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3019.38 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3102.6 4584,4584,4584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3030.55 4592,4592,4592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3059.24 4600,4600,4600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.28 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3313.99 4616,4616,4616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3080.21 4624,4624,4624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3111.94 4632,4632,4632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3077.32 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3182.9 4648,4648,4648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3087.37 4656,4656,4656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3130.87 4664,4664,4664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3122.53 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3209.48 4680,4680,4680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.89 4688,4688,4688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3079.09 4696,4696,4696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3051.11 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3147.74 4712,4712,4712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3075 4720,4720,4720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3117.61 4728,4728,4728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3097.17 
4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3175.61 4744,4744,4744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3014.53 4752,4752,4752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3040.9 4760,4760,4760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3019.39 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3104.9 4776,4776,4776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3042.89 4784,4784,4784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3070.28 4792,4792,4792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.38 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3224.95 4808,4808,4808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3029.86 4816,4816,4816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3064.22 4824,4824,4824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3060.22 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3135.26 4840,4840,4840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3067.5 4848,4848,4848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3095.49 4856,4856,4856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3102.68 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3237.03 4872,4872,4872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3052.72 4880,4880,4880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3095.79 4888,4888,4888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3070.88 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3159.32 4904,4904,4904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3094.28 4912,4912,4912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3114.42 4920,4920,4920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3109.16 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3183.18 4936,4936,4936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3028.59 4944,4944,4944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.51 4952,4952,4952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3033.29 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3122.6 4968,4968,4968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3049.71 4976,4976,4976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3088.69 4984,4984,4984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3074.5 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3219.93 5000,5000,5000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3042.84 5008,5008,5008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3079.17 
5016,5016,5016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3054.48 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3140.92 5032,5032,5032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3068.88 5040,5040,5040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3096.9 5048,5048,5048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3071.15 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3158.52 5064,5064,5064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3026.78 5072,5072,5072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3045.55 5080,5080,5080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3008.5 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3091.96 5096,5096,5096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3021.41 5104,5104,5104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3059.16 5112,5112,5112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3050.83 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3202.16 5128,5128,5128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3024.34 5136,5136,5136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3055.79 5144,5144,5144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3034.59 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3126.44 5160,5160,5160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3042.27 5168,5168,5168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090.15 5176,5176,5176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3076.22 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3208.01 5192,5192,5192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3055.27 5200,5200,5200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3078.26 5208,5208,5208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3069.06 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3153.78 5224,5224,5224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3093.05 5232,5232,5232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3114.84 5240,5240,5240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3115.5 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3208.09 5256,5256,5256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3049.03 5264,5264,5264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3060.21 5272,5272,5272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3055.4 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3134.16 5288,5288,5288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3043.35 
5296,5296,5296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3066.97 5304,5304,5304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.53 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3143.3 5320,5320,5320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,2995.15 5328,5328,5328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3032.51 5336,5336,5336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3013.77 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3089.64 5352,5352,5352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3027.16 5360,5360,5360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.96 5368,5368,5368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3039.2 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3288.19 5384,5384,5384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3048.04 5392,5392,5392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3096.49 5400,5400,5400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3078.3 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3173.05 5416,5416,5416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3088.57 5424,5424,5424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3109.5 5432,5432,5432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3088.76 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3183.5 5448,5448,5448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3033.34 5456,5456,5456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3066.74 5464,5464,5464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3039.62 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3122.02 5480,5480,5480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3061.92 5488,5488,5488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3087 5496,5496,5496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.62 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3139.84 5512,5512,5512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3004.77 5520,5520,5520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3038.14 5528,5528,5528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3021.63 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3090.6 5544,5544,5544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3024.89 5552,5552,5552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3049.24 5560,5560,5560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3031.74 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3175.02 
5576,5576,5576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3003.5 5584,5584,5584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3045.22 5592,5592,5592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3030.96 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3105.13 5608,5608,5608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3041.44 5616,5616,5616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3087.82 5624,5624,5624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3082.16 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3214.64 5640,5640,5640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3053.49 5648,5648,5648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3076.86 5656,5656,5656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3048.93 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3125.23 5672,5672,5672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3045.15 5680,5680,5680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3079.54 5688,5688,5688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3065.05 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3148.65 5704,5704,5704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3011.52 5712,5712,5712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3044.74 5720,5720,5720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3029.6 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3092.26 5736,5736,5736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3040.66 5744,5744,5744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3066.77 5752,5752,5752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3052.22 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,sgemm,gpu,cublas,1,K40,3194.63 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/cublas_zgemm_8.csv000066400000000000000000002136771264277366700245520ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS 8,8,8,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,0.553514 16,16,16,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,4.42213 24,24,24,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,11.3428 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,11.1267 40,40,40,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,20.0942 48,48,48,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,32.3014 56,56,56,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,47.8192 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,67.3892 72,72,72,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,90.8975 
80,80,80,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,118.621 88,88,88,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,151.48 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,187.842 104,104,104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,227.648 112,112,112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,272.141 120,120,120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,319.704 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,372.496 136,136,136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,429.717 144,144,144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,489.005 152,152,152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,557.651 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,626.06 168,168,168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,528.905 176,176,176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,582.766 184,184,184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,611.11 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,671.606 200,200,200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,705.079 208,208,208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,762.297 216,216,216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,828.502 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,894.859 232,232,232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,732.332 240,240,240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,787.076 248,248,248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,836.64 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,887.742 264,264,264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,919.7 272,272,272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,981.701 280,280,280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1046.33 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1108.1 296,296,296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,818.311 304,304,304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,867.347 312,312,312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,952.154 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1016.57 328,328,328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,912.383 336,336,336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,958.813 344,344,344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1009.83 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1056 360,360,360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1100.34 
368,368,368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1152.11 376,376,376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.13 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1261.41 392,392,392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,947.987 400,400,400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,992.133 408,408,408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1035.23 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1070.62 424,424,424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,999.853 432,432,432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1039.22 440,440,440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1071.29 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1115.26 456,456,456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1024.04 464,464,464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1061.61 472,472,472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1111.43 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.12 488,488,488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.04 496,496,496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.75 504,504,504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1253.94 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1292.89 520,520,520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1101.72 528,528,528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1134.63 536,536,536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.08 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.19 552,552,552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213.07 560,560,560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1236.38 568,568,568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1292.13 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1345.84 584,584,584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1102.9 592,592,592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1125.23 600,600,600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1151.05 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.76 616,616,616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1166.52 624,624,624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1178.91 632,632,632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1222.67 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1239.06 648,648,648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1072.47 
656,656,656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1099.58 664,664,664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1115.55 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1134.09 680,680,680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1113.95 688,688,688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1139.48 696,696,696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1165.99 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.52 712,712,712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1093.31 720,720,720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1101.34 728,728,728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1128.95 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1156.18 744,744,744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1141.83 752,752,752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1171.44 760,760,760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.91 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.62 776,776,776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1086.57 784,784,784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1112.6 792,792,792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1136.23 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1162.26 808,808,808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1142.11 816,816,816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1149.28 824,824,824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.99 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1228.83 840,840,840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1099.71 848,848,848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1116.25 856,856,856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1132.23 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1153.94 872,872,872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1154.41 880,880,880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.38 888,888,888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.73 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.3 904,904,904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1108.47 912,912,912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1128.2 920,920,920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1149.26 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.31 936,936,936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1176.75 
944,944,944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.6 952,952,952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1215.64 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1238.44 968,968,968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1122.22 976,976,976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1141.77 984,984,984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1153.77 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.19 1000,1000,1000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1143.13 1008,1008,1008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.37 1016,1016,1016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.51 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.89 1032,1032,1032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1117.35 1040,1040,1040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1123.38 1048,1048,1048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1162.55 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.93 1064,1064,1064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1164.99 1072,1072,1072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1181.06 1080,1080,1080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.25 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1215.54 1096,1096,1096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1139.15 1104,1104,1104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1156.07 1112,1112,1112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1171.18 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.67 1128,1128,1128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1139.66 1136,1136,1136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1157.61 1144,1144,1144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1175.42 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.36 1160,1160,1160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1126.5 1168,1168,1168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1135.77 1176,1176,1176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1150.97 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.56 1192,1192,1192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1148.35 1200,1200,1200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1175.17 1208,1208,1208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.81 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.78 
1224,1224,1224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1119.46 1232,1232,1232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1139.92 1240,1240,1240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1148.13 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.35 1256,1256,1256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1156.3 1264,1264,1264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.7 1272,1272,1272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.99 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.58 1288,1288,1288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1129.7 1296,1296,1296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1145.87 1304,1304,1304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1153.55 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1171.13 1320,1320,1320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1157.76 1328,1328,1328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.77 1336,1336,1336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.86 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.06 1352,1352,1352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1133.25 1360,1360,1360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1151.32 1368,1368,1368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1159.58 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.4 1384,1384,1384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1151.88 1392,1392,1392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1166.51 1400,1400,1400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1178.78 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.96 1416,1416,1416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1135.38 1424,1424,1424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1148.69 1432,1432,1432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1160.41 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.43 1448,1448,1448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1155.74 1456,1456,1456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1168.57 1464,1464,1464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.86 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.34 1480,1480,1480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1140.29 1488,1488,1488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1152.87 1496,1496,1496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.75 
1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.99 1512,1512,1512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1161.59 1520,1520,1520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1173.5 1528,1528,1528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.79 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.56 1544,1544,1544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1135.56 1552,1552,1552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1150.42 1560,1560,1560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1159 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.2 1576,1576,1576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1161.92 1584,1584,1584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.06 1592,1592,1592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.85 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.5 1608,1608,1608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1140.21 1616,1616,1616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1152.47 1624,1624,1624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.68 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.74 1640,1640,1640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1166.64 1648,1648,1648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.83 1656,1656,1656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.28 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.57 1672,1672,1672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1145.87 1680,1680,1680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1157.55 1688,1688,1688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.72 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.65 1704,1704,1704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.68 1712,1712,1712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.81 1720,1720,1720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.49 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.11 1736,1736,1736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1148.42 1744,1744,1744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1159.09 1752,1752,1752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1168.68 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.29 1768,1768,1768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1165.17 1776,1776,1776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1175.82 
1784,1784,1784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.86 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.96 1800,1800,1800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1150.75 1808,1808,1808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1158.91 1816,1816,1816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.22 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180 1832,1832,1832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.48 1840,1840,1840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.51 1848,1848,1848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.97 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.52 1864,1864,1864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1152.82 1872,1872,1872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.81 1880,1880,1880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.97 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.04 1896,1896,1896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.05 1904,1904,1904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1182.09 1912,1912,1912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.17 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.96 1928,1928,1928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1154.45 1936,1936,1936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.3 1944,1944,1944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.74 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.77 1960,1960,1960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1176.18 1968,1968,1968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.6 1976,1976,1976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.72 1992,1992,1992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1157.71 2000,2000,2000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.7 2008,2008,2008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1173.33 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.37 2024,2024,2024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.04 2032,2032,2032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.13 2040,2040,2040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.11 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.88 2056,2056,2056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1159.05 
2064,2064,2064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1168.18 2072,2072,2072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.49 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.7 2088,2088,2088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.1 2096,2096,2096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.01 2104,2104,2104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.84 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.02 2120,2120,2120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1160.55 2128,2128,2128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.78 2136,2136,2136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.72 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.5 2152,2152,2152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.45 2160,2160,2160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.39 2168,2168,2168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.94 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.66 2184,2184,2184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.03 2192,2192,2192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.81 2200,2200,2200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.03 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.08 2216,2216,2216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.73 2224,2224,2224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.31 2232,2232,2232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.59 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.05 2248,2248,2248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1161.53 2256,2256,2256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.96 2264,2264,2264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1178.24 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.73 2280,2280,2280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.08 2288,2288,2288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.96 2296,2296,2296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.04 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.42 2312,2312,2312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1163.31 2320,2320,2320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.12 2328,2328,2328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.93 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.11 
2344,2344,2344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.87 2352,2352,2352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.98 2360,2360,2360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.97 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.63 2376,2376,2376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.34 2384,2384,2384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.04 2392,2392,2392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1182.13 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.25 2408,2408,2408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1181.5 2416,2416,2416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.95 2424,2424,2424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.39 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.94 2440,2440,2440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1169.11 2448,2448,2448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1176.64 2456,2456,2456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.24 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.96 2472,2472,2472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.26 2480,2480,2480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.41 2488,2488,2488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.49 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.36 2504,2504,2504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1167.89 2512,2512,2512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1175.95 2520,2520,2520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.5 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.74 2536,2536,2536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1182.33 2544,2544,2544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.75 2552,2552,2552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.66 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.83 2568,2568,2568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.65 2576,2576,2576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.34 2584,2584,2584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.76 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.9 2600,2600,2600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.21 2608,2608,2608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.7 2616,2616,2616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.73 
2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.3 2632,2632,2632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.6 2640,2640,2640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.85 2648,2648,2648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.93 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.52 2664,2664,2664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.8 2672,2672,2672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.73 2680,2680,2680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.03 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.04 2696,2696,2696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.77 2704,2704,2704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1178.42 2712,2712,2712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.72 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.01 2728,2728,2728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.35 2736,2736,2736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.5 2744,2744,2744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.78 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.02 2760,2760,2760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1172.09 2768,2768,2768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.54 2776,2776,2776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.01 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.12 2792,2792,2792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.73 2800,2800,2800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.83 2808,2808,2808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.06 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.25 2824,2824,2824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.88 2832,2832,2832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1181.33 2840,2840,2840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.11 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.78 2856,2856,2856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.39 2864,2864,2864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.87 2872,2872,2872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.36 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.19 2888,2888,2888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1174.73 2896,2896,2896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1181.56 
2904,2904,2904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.4 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.34 2920,2920,2920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.3 2928,2928,2928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.69 2936,2936,2936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.62 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.37 2952,2952,2952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1175.05 2960,2960,2960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1182.74 2968,2968,2968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.08 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.39 2984,2984,2984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189 2992,2992,2992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.9 3000,3000,3000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.84 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.21 3016,3016,3016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1177.01 3024,3024,3024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.69 3032,3032,3032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.72 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.44 3048,3048,3048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.78 3056,3056,3056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.81 3064,3064,3064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.34 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.96 3080,3080,3080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1178.89 3088,3088,3088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.97 3096,3096,3096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.38 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.02 3112,3112,3112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.02 3120,3120,3120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.07 3128,3128,3128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.3 3144,3144,3144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1179.17 3152,3152,3152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.97 3160,3160,3160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.92 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.01 3176,3176,3176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.31 
3184,3184,3184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.89 3192,3192,3192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.68 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.91 3208,3208,3208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.22 3216,3216,3216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.65 3224,3224,3224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.64 3240,3240,3240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.46 3248,3248,3248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.94 3256,3256,3256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.61 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.33 3272,3272,3272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.32 3280,3280,3280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.5 3288,3288,3288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.42 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.52 3304,3304,3304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.72 3312,3312,3312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.26 3320,3320,3320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.28 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.24 3336,3336,3336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.49 3344,3344,3344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.6 3352,3352,3352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.79 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.52 3368,3368,3368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.38 3376,3376,3376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.33 3384,3384,3384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.71 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.28 3400,3400,3400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1182.41 3408,3408,3408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.89 3416,3416,3416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.44 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.67 3432,3432,3432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.18 3440,3440,3440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.66 3448,3448,3448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.04 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.17 
3464,3464,3464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.07 3472,3472,3472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189 3480,3480,3480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.13 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.64 3496,3496,3496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.21 3504,3504,3504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.64 3512,3512,3512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.94 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.26 3528,3528,3528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.65 3536,3536,3536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.6 3544,3544,3544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.2 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.12 3560,3560,3560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.64 3568,3568,3568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.36 3576,3576,3576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.12 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.08 3592,3592,3592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1183.42 3600,3600,3600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.3 3608,3608,3608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.05 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.2 3624,3624,3624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.48 3632,3632,3632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.81 3640,3640,3640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.9 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.87 3656,3656,3656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1184.68 3664,3664,3664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.88 3672,3672,3672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.93 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.54 3688,3688,3688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.62 3696,3696,3696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.36 3704,3704,3704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.25 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.28 3720,3720,3720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.02 3728,3728,3728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.21 3736,3736,3736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.04 
3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.52 3752,3752,3752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.24 3760,3760,3760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.15 3768,3768,3768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.69 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.53 3784,3784,3784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1185.56 3792,3792,3792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.53 3800,3800,3800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.15 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.97 3816,3816,3816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.04 3824,3824,3824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.13 3832,3832,3832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.82 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.14 3848,3848,3848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.05 3856,3856,3856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.88 3864,3864,3864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.73 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.45 3880,3880,3880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.53 3888,3888,3888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.23 3896,3896,3896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.12 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.09 3912,3912,3912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1186.75 3920,3920,3920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.1 3928,3928,3928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.94 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.5 3944,3944,3944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.94 3952,3952,3952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.06 3960,3960,3960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.49 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.08 3976,3976,3976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.26 3984,3984,3984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.71 3992,3992,3992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.31 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.84 4008,4008,4008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.9 4016,4016,4016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.51 
4024,4024,4024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.98 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.87 4040,4040,4040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.4 4048,4048,4048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.94 4056,4056,4056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.24 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.41 4072,4072,4072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.64 4080,4080,4080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.63 4088,4088,4088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.2 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.81 4104,4104,4104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.66 4112,4112,4112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.51 4120,4120,4120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.24 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.66 4136,4136,4136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.07 4144,4144,4144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.12 4152,4152,4152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.39 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.07 4168,4168,4168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.03 4176,4176,4176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.7 4184,4184,4184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.05 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.92 4200,4200,4200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.73 4208,4208,4208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203 4216,4216,4216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.83 4232,4232,4232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1188.32 4240,4240,4240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.31 4248,4248,4248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.08 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.96 4264,4264,4264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.55 4272,4272,4272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.44 4280,4280,4280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.07 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.49 4296,4296,4296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.74 
4304,4304,4304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.84 4312,4312,4312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.12 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.01 4328,4328,4328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.42 4336,4336,4336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.54 4344,4344,4344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.07 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1214.77 4360,4360,4360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.18 4368,4368,4368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.74 4376,4376,4376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.88 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.63 4392,4392,4392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.38 4400,4400,4400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.2 4408,4408,4408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.47 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.38 4424,4424,4424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.98 4432,4432,4432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.49 4440,4440,4440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.14 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.47 4456,4456,4456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.83 4464,4464,4464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.45 4472,4472,4472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.92 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.22 4488,4488,4488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.57 4496,4496,4496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.32 4504,4504,4504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.57 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.31 4520,4520,4520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.8 4528,4528,4528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.91 4536,4536,4536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.16 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213 4552,4552,4552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1191.56 4560,4560,4560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.33 4568,4568,4568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.38 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.95 
4584,4584,4584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.74 4592,4592,4592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.21 4600,4600,4600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1207.79 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.85 4616,4616,4616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.63 4624,4624,4624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1187.72 4632,4632,4632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.01 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.77 4648,4648,4648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.46 4656,4656,4656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.59 4664,4664,4664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.9 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.48 4680,4680,4680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1170.21 4688,4688,4688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.81 4696,4696,4696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1180.6 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.97 4712,4712,4712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1190.12 4720,4720,4720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.42 4728,4728,4728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.13 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.91 4744,4744,4744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.09 4752,4752,4752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.84 4760,4760,4760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.48 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.81 4776,4776,4776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.02 4784,4784,4784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.93 4792,4792,4792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.54 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213 4808,4808,4808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1189.89 4816,4816,4816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.08 4824,4824,4824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.72 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.56 4840,4840,4840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.33 4848,4848,4848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.37 4856,4856,4856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.66 
4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.27 4872,4872,4872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.54 4880,4880,4880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.25 4888,4888,4888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.12 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.6 4904,4904,4904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.68 4912,4912,4912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.64 4920,4920,4920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.42 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.22 4936,4936,4936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1192.38 4944,4944,4944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.59 4952,4952,4952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.92 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.84 4968,4968,4968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.15 4976,4976,4976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.37 4984,4984,4984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.05 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213.4 5000,5000,5000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.85 5008,5008,5008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.19 5016,5016,5016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.44 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205 5032,5032,5032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.24 5040,5040,5040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.57 5048,5048,5048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.67 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213.04 5064,5064,5064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.21 5072,5072,5072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.71 5080,5080,5080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.57 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.56 5096,5096,5096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.6 5104,5104,5104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.36 5112,5112,5112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.15 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1211.51 5128,5128,5128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.28 5136,5136,5136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.37 
5144,5144,5144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.28 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.98 5160,5160,5160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.88 5168,5168,5168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.54 5176,5176,5176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.63 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.32 5192,5192,5192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.08 5200,5200,5200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.91 5208,5208,5208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.82 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.63 5224,5224,5224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.44 5232,5232,5232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.72 5240,5240,5240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.12 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213.24 5256,5256,5256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.22 5264,5264,5264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.86 5272,5272,5272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.3 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.57 5288,5288,5288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.63 5296,5296,5296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.23 5304,5304,5304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.27 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.28 5320,5320,5320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1193.91 5328,5328,5328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.28 5336,5336,5336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.85 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.86 5352,5352,5352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.52 5360,5360,5360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1204.65 5368,5368,5368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.72 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.04 5384,5384,5384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1194.34 5392,5392,5392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.73 5400,5400,5400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.11 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.75 5416,5416,5416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.69 
5424,5424,5424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.8 5432,5432,5432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.48 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.98 5448,5448,5448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.25 5456,5456,5456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.21 5464,5464,5464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.79 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.31 5480,5480,5480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.36 5488,5488,5488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.32 5496,5496,5496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.31 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.54 5512,5512,5512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.14 5520,5520,5520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1198.73 5528,5528,5528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.02 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.55 5544,5544,5544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.62 5552,5552,5552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.97 5560,5560,5560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1209.54 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.29 5576,5576,5576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.12 5584,5584,5584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1197.13 5592,5592,5592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.34 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.9 5608,5608,5608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1201.35 5616,5616,5616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1205.05 5624,5624,5624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.67 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.72 5640,5640,5640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1195.4 5648,5648,5648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1199.54 5656,5656,5656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1203.04 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.13 5672,5672,5672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.99 5680,5680,5680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.91 5688,5688,5688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1210.29 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1213.52 
5704,5704,5704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1196.91 5712,5712,5712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1200.34 5720,5720,5720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.72 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.25 5736,5736,5736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1202.87 5744,5744,5744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1206.16 5752,5752,5752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1208.22 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,zgemm,gpu,cublas,1,K40,1212.52 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/peak_dp.csv000066400000000000000000000431361264277366700232450ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 
5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,1660 clblas-2.10/doc/performance/cuBLAS_7.5/Tesla_K40/peak_sp.csv000066400000000000000000000431361264277366700232640ustar00rootroot00000000000000m,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,label,GFLOPS 32,32,32,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 64,64,64,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 96,96,96,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 128,128,128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 160,160,160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 192,192,192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 224,224,224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 256,256,256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 288,288,288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 320,320,320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 352,352,352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 384,384,384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 416,416,416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 448,448,448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 480,480,480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 512,512,512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 544,544,544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 576,576,576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
608,608,608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 640,640,640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 672,672,672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 704,704,704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 736,736,736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 768,768,768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 800,800,800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 832,832,832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 864,864,864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 896,896,896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 928,928,928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 960,960,960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 992,992,992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1024,1024,1024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1056,1056,1056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1088,1088,1088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1120,1120,1120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1152,1152,1152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1184,1184,1184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1216,1216,1216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1248,1248,1248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1280,1280,1280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1312,1312,1312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1344,1344,1344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1376,1376,1376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1408,1408,1408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1440,1440,1440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1472,1472,1472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1504,1504,1504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1536,1536,1536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1568,1568,1568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1600,1600,1600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1632,1632,1632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1664,1664,1664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1696,1696,1696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1728,1728,1728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 
Peak,5000 1760,1760,1760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1792,1792,1792,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1824,1824,1824,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1856,1856,1856,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1888,1888,1888,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1920,1920,1920,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1952,1952,1952,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 1984,1984,1984,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2016,2016,2016,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2048,2048,2048,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2080,2080,2080,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2112,2112,2112,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2144,2144,2144,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2176,2176,2176,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2208,2208,2208,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2240,2240,2240,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2272,2272,2272,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2304,2304,2304,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2336,2336,2336,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2368,2368,2368,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2400,2400,2400,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2432,2432,2432,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2464,2464,2464,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2496,2496,2496,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2528,2528,2528,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2560,2560,2560,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2592,2592,2592,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2624,2624,2624,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2656,2656,2656,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2688,2688,2688,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2720,2720,2720,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2752,2752,2752,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2784,2784,2784,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2816,2816,2816,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2848,2848,2848,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
2880,2880,2880,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2912,2912,2912,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2944,2944,2944,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 2976,2976,2976,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3008,3008,3008,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3040,3040,3040,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3072,3072,3072,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3104,3104,3104,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3136,3136,3136,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3168,3168,3168,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3200,3200,3200,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3232,3232,3232,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3264,3264,3264,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3296,3296,3296,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3328,3328,3328,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3360,3360,3360,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3392,3392,3392,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3424,3424,3424,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3456,3456,3456,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3488,3488,3488,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3520,3520,3520,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3552,3552,3552,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3584,3584,3584,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3616,3616,3616,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3648,3648,3648,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3680,3680,3680,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3712,3712,3712,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3744,3744,3744,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3776,3776,3776,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3808,3808,3808,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3840,3840,3840,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3872,3872,3872,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3904,3904,3904,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3936,3936,3936,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 3968,3968,3968,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
4000,4000,4000,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4032,4032,4032,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4064,4064,4064,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4096,4096,4096,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4128,4128,4128,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4160,4160,4160,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4192,4192,4192,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4224,4224,4224,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4256,4256,4256,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4288,4288,4288,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4320,4320,4320,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4352,4352,4352,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4384,4384,4384,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4416,4416,4416,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4448,4448,4448,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4480,4480,4480,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4512,4512,4512,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4544,4544,4544,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4576,4576,4576,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4608,4608,4608,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4640,4640,4640,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4672,4672,4672,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4704,4704,4704,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4736,4736,4736,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4768,4768,4768,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4800,4800,4800,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4832,4832,4832,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4864,4864,4864,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4896,4896,4896,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4928,4928,4928,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4960,4960,4960,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 4992,4992,4992,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5024,5024,5024,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5056,5056,5056,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5088,5088,5088,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 
5120,5120,5120,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5152,5152,5152,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5184,5184,5184,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5216,5216,5216,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5248,5248,5248,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5280,5280,5280,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5312,5312,5312,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5344,5344,5344,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5376,5376,5376,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5408,5408,5408,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5440,5440,5440,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5472,5472,5472,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5504,5504,5504,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5536,5536,5536,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5568,5568,5568,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5600,5600,5600,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5632,5632,5632,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5664,5664,5664,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5696,5696,5696,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5728,5728,5728,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 5760,5760,5760,0,0,0,0,0,0,1,1,column,none,transpose,left,upper,unit,dgemm,gpu,cublas,K40 Peak,5000 clblas-2.10/src/000077500000000000000000000000001264277366700135075ustar00rootroot00000000000000clblas-2.10/src/CMakeLists.txt000066400000000000000000000404461264277366700162570ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## cmake_minimum_required(VERSION 2.8) #User toggle-able options that can be changed on the command line with -D option( BUILD_RUNTIME "Build the BLAS runtime library" ON ) option( BUILD_TEST "Build the library testing suite (dependency on google test, Boost, and ACML)" ON ) option( BUILD_PERFORMANCE "Copy the performance scripts that can measure and graph performance" OFF ) option( BUILD_SAMPLE "Build the sample programs" OFF ) option( BUILD_CLIENT "Build a command line clBLAS client program with a variety of configurable parameters (dependency on Boost)" OFF ) option( BUILD_KTEST "A command line tool for testing a single clBLAS kernel" ON ) option( BUILD_SHARED_LIBS "Build shared libraries" ON ) #enable or disable offline compilation for different devices. Currently only Hawaii, Bonaire, and Tahiti have the option. #option( OPENCL_OFFLINE_BUILD_HAWAII_KERNEL "Offline compile the OpenCL kernels for Hawaii device" OFF) #option( OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL "Offline compile the OpenCL kernels for Bonaire device" OFF) #option( OPENCL_OFFLINE_BUILD_TAHITI_KERNEL "Offline compile the OpenCL kernels for Tahiti device" OFF) set( OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OFF) set( OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL OFF) set( OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OFF) #if( (OPENCL_OFFLINE_BUILD_HAWAII_KERNEL AND OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL) OR (OPENCL_OFFLINE_BUILD_HAWAII_KERNEL AND OPENCL_OFFLINE_BUILD_TAHITI_KERNEL) OR (OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL AND OPENCL_OFFLINE_BUILD_TAHITI_KERNEL)) # MESSAGE( WARNING "More than one device is chosen for offline compilation of static kernels. This might result in running out of heap memory with certain drivers. Please consider offline compilation for ONE device only." ) #endif( ) #if( NOT OPENCL_OFFLINE_BUILD_HAWAII_KERNEL ) #use dynamically generated kernels # MESSAGE(STATUS "Build dynamic Hawaii kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_HAWAII_KERNEL to build kernels at compile-time. This eliminates clBuildProgram() overhead and gives better kernel performance with certain drivers.") add_definitions(-DCLBLAS_HAWAII_DYNAMIC_KERNEL) #else() # MESSAGE(STATUS "Build static Hawaii kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_HAWAII_KERNEL to build kernels at run-time") # MESSAGE(STATUS "Please ensure the presence of a Hawaii device in the system. With certain driver/compiler flags, this might result in a compile-time error.") #endif( ) #if( NOT OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL ) #use dynamically generated kernels # MESSAGE(STATUS "Build dynamic Bonaire kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL to build kernels at compile-time. This eliminates clBuildProgram() overhead and gives better kernel performance with certain drivers.") add_definitions(-DCLBLAS_BONAIRE_DYNAMIC_KERNEL) #else() # MESSAGE(STATUS "Build static Bonaire kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL to build kernels at run-time") # MESSAGE(STATUS "Please ensure the presence of a Bonaire device in the system. With certain driver/compiler flags, this might result in a compile-time error.") #endif( ) #if( NOT OPENCL_OFFLINE_BUILD_TAHITI_KERNEL ) #use dynamically generated kernels # MESSAGE(STATUS "Build dynamic Tahiti kernels.") # MESSAGE(STATUS "Check OPENCL_OFFLINE_BUILD_TAHITI_KERNEL to build kernels at compile-time.
This eliminates clBuildProgram() overhead and gives better kernel performance with certain drivers.") add_definitions(-DCLBLAS_TAHITI_DYNAMIC_KERNEL) #else( ) # MESSAGE(STATUS "Build static Tahiti kernels.") # MESSAGE(STATUS "Uncheck OPENCL_OFFLINE_BUILD_TAHITI_KERNEL to build kernels at run-time") # MESSAGE(STATUS "Please ensure the presence of a Tahiti device in the system. With certain driver/compiler flags, this might result in a compile-time error.") #endif( ) # Ask the user to verify the compiler version. If OpenCL 2.0 is supported, certain public flags can be used set( OPENCL_VERSION "1.2" CACHE STRING "The version of OpenCL supported by your driver/device" ) set_property( CACHE OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 ) message( STATUS "You have confirmed OpenCL ${OPENCL_VERSION} is supported in your system" ) # By default test-correctness is linked and tested against the ACML library. # However, test-correctness can instead use NETLIB as a reference library # On Mac OSX systems, this must be set to OFF for the build to succeed (due to nesting of FindBLAS code) if ( APPLE ) set(CORR_TEST_WITH_ACML OFF CACHE BOOL "Use ACML library in correctness tests") else ( ) message(STATUS "CORR_TEST_WITH_ACML set to OFF. Will try to link with libblas.so") set(CORR_TEST_WITH_ACML OFF CACHE BOOL "Use ACML library in correctness tests") endif( ) if( CMAKE_GENERATOR MATCHES "NMake" ) option( NMAKE_COMPILE_VERBOSE "Print compile and link strings to the console" OFF ) if( NMAKE_COMPILE_VERBOSE ) set( CMAKE_START_TEMP_FILE "" ) set( CMAKE_END_TEMP_FILE "" ) set( CMAKE_VERBOSE_MAKEFILE 1 ) endif( ) endif( ) # If we are on Linux, and we wish to link with the netlib BLAS implementation when BUILD_TEST is ON, we need to have a valid Fortran compiler if(BUILD_TEST AND NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE) project(clBLAS Fortran C CXX ) else( ) project(clBLAS C CXX) endif( ) # Define a version for the code if( NOT DEFINED clBLAS_VERSION_MAJOR ) set( clBLAS_VERSION_MAJOR 2 ) endif( ) if( NOT DEFINED clBLAS_VERSION_MINOR ) set( clBLAS_VERSION_MINOR 10 ) endif( ) if( NOT DEFINED clBLAS_VERSION_PATCH ) set( clBLAS_VERSION_PATCH 0 ) endif( ) set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}") # Increment this if we break backward compatibility. set( clBLAS_SOVERSION 2 ) # We have custom-written Find* modules now in the root source directory set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR} ) # On Windows, it's convenient to change the default install prefix such that it does NOT point to 'program files' (permissions problems) # Need to check out the CMAKE_RUNTIME_OUTPUT_DIRECTORY variable, and see if that eliminates the need to modify the install path if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE ) endif( ) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE) endif() # These variables are meant to contain strings which should be appended to the installation paths # of library and executable binaries, respectively. They are meant to be user configurable/overridable.
set( SUFFIX_LIB_DEFAULT "" ) set( SUFFIX_BIN_DEFAULT "" ) if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64) set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) else() if(CMAKE_SIZEOF_VOID_P MATCHES 8) set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) else() set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) endif() endif() message(STATUS "Target platform: ${TARGET_PLATFORM}-bit") if(TARGET_PLATFORM EQUAL 32) set(_arch "x86" INTERNAL) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE) else() set(_arch "x86_64" INTERNAL) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE) if( NOT APPLE ) set( SUFFIX_LIB_DEFAULT "64" ) endif( ) endif() set( SUFFIX_LIB ${SUFFIX_LIB_DEFAULT} CACHE STRING "String to append to 'lib' install path" ) set( SUFFIX_BIN ${SUFFIX_BIN_DEFAULT} CACHE STRING "String to append to 'bin' install path" ) if( MSVC_IDE ) set_property( GLOBAL PROPERTY USE_FOLDERS TRUE ) endif( ) # add the math library for Linux if( UNIX ) set(MATH_LIBRARY "m") endif() # set the path to specific OpenCL compiler set( OPENCL_COMPILER_DIR "OPENCL COMPILER PATH" CACHE PATH "OPENCL COMPILER PATH") if ( ${OPENCL_COMPILER_DIR} STREQUAL "OPENCL COMPILER PATH") message( STATUS "Using default OpenCL Compiler") set(ENV_PATH "$ENV{PATH}") else () message( STATUS "OPENCL COMPILER: ${OPENCL_COMPILER_DIR}") if(UNIX) set(ENV_PATH "${OPENCL_COMPILER_DIR}") else() set(ENV_PATH "${OPENCL_COMPILER_DIR}") endif() endif() # Find the BLAS library # TODO: maybe this could be written using the FindBLAS module in the future if( BUILD_TEST ) if(NOT CORR_TEST_WITH_ACML) if(APPLE) find_library(BLAS_LIBRARIES Accelerate HINTS /System/Library/Frameworks/Accelerate.framework) MARK_AS_ADVANCED(BLAS_LIBRARIES) message(STATUS "Using Accelerate framework on Mac OS-X") else() find_package( Netlib COMPONENTS BLAS REQUIRED ) endif() else( ) # Find ACML BLAS implementation # platform dependent ACML subdirectory if (WIN32) set(ACML_SUBDIR ifort${TARGET_PLATFORM}_mp) else() set(ACML_SUBDIR gfortran${TARGET_PLATFORM}_mp) endif() find_path(ACML_INCLUDE_DIRS acml.h HINTS ${ACML_ROOT}/include ${ACML_ROOT}/${ACML_SUBDIR}/include $ENV{ACML_ROOT}/include $ENV{ACML_ROOT}/${ACML_SUBDIR}/include ) if( ACML_INCLUDE_DIRS ) else() message(WARNING "Cannot find acml.h") endif() if( UNIX ) find_library(ACML_LIBRARIES acml_mp HINTS ${ACML_ROOT}/lib ${ACML_ROOT}/${ACML_SUBDIR}/lib $ENV{ACML_ROOT}/lib $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib ) find_library(_acml_mv_library acml_mv HINTS ${ACML_ROOT}/lib ${ACML_ROOT}/${ACML_SUBDIR}/lib $ENV{ACML_ROOT}/lib $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib ) mark_as_advanced(_acml_mv_library) endif( ) if(WIN32) find_library(ACML_LIBRARIES libacml_mp_dll HINTS ${ACML_ROOT}/lib ${ACML_ROOT}/${ACML_SUBDIR}/lib $ENV{ACML_ROOT}/lib $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib ) endif( ) if( NOT ACML_LIBRARIES ) message(WARNING "Cannot find libacml") endif( ) if(ACML_INCLUDE_DIRS AND ACML_LIBRARIES) if(_acml_mv_library) list(APPEND ACML_LIBRARIES ${_acml_mv_library}) endif() message(STATUS "Found ACML: ${ACML_LIBRARIES}") set(ACML_FOUND TRUE BOOL "Found the ACML package") endif() mark_as_advanced(ACML_FOUND ACML_INCLUDE_DIRS ACML_LIBRARIES) endif( ) endif( ) # This will define OPENCL_FOUND find_package( OpenCL ) # Find Boost on the system, and configure the type of boost build we want set( Boost_USE_MULTITHREADED ON ) set( Boost_USE_STATIC_LIBS ON ) set( Boost_DETAILED_FAILURE_MSG ON ) set( 
Boost_DEBUG ON ) set( Boost_ADDITIONAL_VERSIONS "1.44.0" "1.44" "1.47.0" "1.47" ) find_package( Boost 1.33.0 COMPONENTS program_options ) message(STATUS "Boost_PROGRAM_OPTIONS_LIBRARY: ${Boost_PROGRAM_OPTIONS_LIBRARY}") if( NOT Boost_FOUND ) message( STATUS "The clBLAS ktest requires boost to be installed" ) set( BUILD_KTEST OFF ) message( STATUS "The clBLAS client requires boost to be installed" ) set( BUILD_CLIENT OFF ) endif() # Turn on maximum compiler verbosity if(CMAKE_COMPILER_IS_GNUCXX) add_definitions(-pedantic -Wall -Wextra -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500 ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING "Default CFLAGS" FORCE) # Don't use -rpath. set(CMAKE_SKIP_RPATH ON CACHE BOOL "Skip RPATH" FORCE) set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}") set(CMAKE_Fortran_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_Fortran_FLAGS}") if(TARGET_PLATFORM EQUAL 32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin") endif() elseif( MSVC ) # CMake sets huge stack frames for windows, for whatever reason. We go with compiler default. string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" ) string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" ) string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) endif( ) if (WIN32) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif( ) #TODO: We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( ) add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS ) configure_file( "${PROJECT_SOURCE_DIR}/clBLAS.version.h.in" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" ) # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive install( FILES "clBLAS.h" "clAmdBlas.h" "clAmdBlas.version.h" "clBLAS-complex.h" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" DESTINATION "./include" ) if( BUILD_CLIENT AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/client") add_subdirectory( client ) endif( ) if( BUILD_PERFORMANCE AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/scripts/perf" ) add_subdirectory( scripts/perf ) endif( ) if( BUILD_RUNTIME AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library" ) # add_subdirectory( library/tools/bingen ) add_subdirectory( library ) add_subdirectory( library/tools/tune ) if( BUILD_KTEST ) add_subdirectory( library/tools/ktest ) endif( ) endif() if( BUILD_SAMPLE AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/samples" ) add_subdirectory( samples ) endif( ) # The build server is not supposed to build or package any of the tests; build server script will define this on the command line with # cmake -G "Visual Studio 10 Win64" -D BUILDSERVER:BOOL=ON ../.. if( BUILD_TEST ) if( IS_DIRECTORY "${PROJECT_SOURCE_DIR}/tests" ) add_subdirectory(tests) endif( ) # These tests #include , which is not windows compliant if (NOT WIN32 AND IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library" ) add_subdirectory( library/blas/gens/tests ) add_subdirectory( library/blas/gens/legacy/tests ) add_subdirectory( library/common/tests ) endif( ) endif( ) if(WIN32) set(destdir CMake) else() set(destdir lib${SUFFIX_LIB}/cmake/clBLAS) endif() string(REGEX REPLACE "[^/]+" ".." 
reldir "${destdir}") configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLASConfigVersion.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/clBLASConfigVersion.cmake @ONLY) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLASConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/clBLASConfig.cmake @ONLY) install(EXPORT Library DESTINATION ${destdir} FILE clBLASTargets.cmake) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLASConfigVersion.cmake ${CMAKE_CURRENT_BINARY_DIR}/clBLASConfig.cmake DESTINATION ${destdir}) # The following code is setting variables to control the behavior of CPack to generate our if( WIN32 ) set( CPACK_SOURCE_GENERATOR "ZIP" ) set( CPACK_GENERATOR "ZIP" ) else( ) set( CPACK_SOURCE_GENERATOR "TGZ" ) set( CPACK_GENERATOR "TGZ" ) endif( ) if( TARGET_PLATFORM EQUAL 64 ) set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x64") else( ) set( CPACK_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-x32") endif( ) set( CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${clBLAS_VERSION}-${CMAKE_HOST_SYSTEM_NAME}-Source") set( CPACK_PACKAGE_VERSION_MAJOR ${clBLAS_VERSION_MAJOR} ) set( CPACK_PACKAGE_VERSION_MINOR ${clBLAS_VERSION_MINOR} ) set( CPACK_PACKAGE_VERSION_PATCH ${clBLAS_VERSION_PATCH} ) set( CPACK_PACKAGE_DESCRIPTION_SUMMARY "OpenCL implementation of a BLAS library") set( CPACK_PACKAGE_VENDOR "Neutral") set( CPACK_SOURCE_IGNORE_FILES "/\\\\.hg/;/\\\\.svn/;/\\\\.git/" ) # Define all variables that influence CPack before including CPack, such as install targets include( CPack ) clblas-2.10/src/FindNetlib.cmake000066400000000000000000000077171264277366700165430ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## # Locate an Netlib implementation. # Pre-built binaries for windows can be found at http://icl.cs.utk.edu/lapack-for-windows/lapack/ # # Defines the following variables: # # Netlib_FOUND - Found the OPENCL framework # # Also defines the library variables below as normal # variables. These contain debug/optimized keywords when # a debugging library is found. # # Netlib_LIBRARIES - libNetlib # # Accepts the following variables as input: # # Netlib_ROOT - (as a CMake or environment variable) # The root directory of where Netlib libraries are found # # FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether FindNetlib should search for # 64bit or 32bit libs # # Netlib_COMPILERS - Prioritized list of compiler flavors that this find package should search for when # looking for libraries. 
The user could have multiple flavors of Netlib installed # and setting this before calling FindPackage will alter order searched #----------------------- # Example Usage: # # find_package(Netlib REQUIRED) # include_directories(${Netlib_INCLUDE_DIRS}) # # add_executable(foo foo.cc) # target_link_libraries(foo ${Netlib_LIBRARIES}) # #----------------------- #TODO: Extend this to use Netlib_FIND_COMPONENTS, Netlib_FIND_REQUIRED, Netlib_FIND_QUIETLY include( FindPackageHandleStandardArgs ) # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) # This is a prioritized list of Netlib compiler versions that this FindModule looks for if( NOT DEFINED Netlib_COMPILERS ) set( Netlib_COMPILERS minGW intel ) endif( ) # Debug print statements #message( "Netlib_LIBRARY_PATH_SUFFIXES: ${Netlib_LIBRARY_PATH_SUFFIXES}" ) #message( "ENV{Netlib_ROOT}: $ENV{Netlib_ROOT}" ) #message( "Netlib_FIND_COMPONENTS: ${Netlib_FIND_COMPONENTS}" ) #message( "Netlib_FIND_REQUIRED: ${Netlib_FIND_REQUIRED}" ) # If the user does not set which components to find, then default to all components if( NOT Netlib_FIND_COMPONENTS ) set( Netlib_FIND_COMPONENTS BLAS ) endif( ) # The library name available from Netlib has different names for 64bit and 32bit libs if( LIB64 ) set( Netlib_BLAS_LIBNAME blas ) # set( Netlib_BLAS_LIBNAME BLAS ) Even though the download is named BLAS, the linker expects the .dll to be called libblas.dll else( ) set( Netlib_BLAS_LIBNAME blas ) endif( ) list( FIND Netlib_FIND_COMPONENTS BLAS contains_BLAS ) if( NOT contains_BLAS EQUAL -1 ) # Find and set the location of main Netlib lib file find_library( Netlib_BLAS_LIBRARY NAMES ${Netlib_BLAS_LIBNAME} HINTS ${Netlib_ROOT} ENV Netlib_ROOT PATHS /usr/lib /usr/local/lib /usr/lib/libblas DOC "Netlib dynamic library path" PATH_SUFFIXES lib ) mark_as_advanced( Netlib_BLAS_LIBRARY ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( NETLIB DEFAULT_MSG Netlib_BLAS_LIBRARY ) endif( ) if( NETLIB_FOUND ) list( APPEND Netlib_LIBRARIES ${Netlib_BLAS_LIBRARY} ) else( ) if( NOT Netlib_FIND_QUIETLY ) message( WARNING "FindNetlib could not find the Netlib library" ) message( STATUS "Did you remember to set the Netlib_ROOT environment variable?" ) endif( ) endif() clblas-2.10/src/FindOpenCL.cmake000066400000000000000000000063501264277366700164360ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## # Locate an OpenCL implementation. # Currently supports AMD APP SDK (http://developer.amd.com/sdks/AMDAPPSDK/Pages/default.aspx/) # # Defines the following variables: # # OPENCL_FOUND - Found the OPENCL framework # OPENCL_INCLUDE_DIRS - Include directories # # Also defines the library variables below as normal # variables. 
These contain debug/optimized keywords when # a debugging library is found. # # OPENCL_LIBRARIES - libopencl # # Accepts the following variables as input: # # OPENCL_ROOT - (as a CMake or environment variable) # The root directory of the OpenCL implementation found # # FIND_LIBRARY_USE_LIB64_PATHS - Global property that controls whether findOpenCL should search for # 64bit or 32bit libs #----------------------- # Example Usage: # # find_package(OPENCL REQUIRED) # include_directories(${OPENCL_INCLUDE_DIRS}) # # add_executable(foo foo.cc) # target_link_libraries(foo ${OPENCL_LIBRARIES}) # #----------------------- find_path(OPENCL_INCLUDE_DIRS NAMES OpenCL/cl.h CL/cl.h HINTS ${OPENCL_ROOT}/include $ENV{AMDAPPSDKROOT}/include $ENV{CUDA_PATH}/include PATHS /usr/include /usr/local/include /usr/local/cuda/include /opt/cuda/include DOC "OpenCL header file path" ) mark_as_advanced( OPENCL_INCLUDE_DIRS ) # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) if( LIB64 ) find_library( OPENCL_LIBRARIES NAMES OpenCL HINTS ${OPENCL_ROOT}/lib $ENV{AMDAPPSDKROOT}/lib $ENV{CUDA_PATH}/lib DOC "OpenCL dynamic library path" PATH_SUFFIXES x86_64 x64 x86_64/sdk PATHS /usr/lib /usr/local/cuda/lib /opt/cuda/lib ) else( ) find_library( OPENCL_LIBRARIES NAMES OpenCL HINTS ${OPENCL_ROOT}/lib $ENV{AMDAPPSDKROOT}/lib $ENV{CUDA_PATH}/lib DOC "OpenCL dynamic library path" PATH_SUFFIXES x86 Win32 PATHS /usr/lib /usr/local/cuda/lib /opt/cuda/lib ) endif( ) mark_as_advanced( OPENCL_LIBRARIES ) include( FindPackageHandleStandardArgs ) FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS ) if( NOT OPENCL_FOUND ) message( STATUS "FindOpenCL looked for libraries named: OpenCL" ) endif() clblas-2.10/src/clAmdBlas.h000066400000000000000000016653631264277366700155260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLAMDBLAS_H_ #define CLAMDBLAS_H_ /*! @file clAmdBlas.h * /note clAmdBlas.h is a deprecated header file. * This header is provided to help projects that were written with the older clAmdBlas codebase, to help them * port to the new API at their own schedule. It will not be maintained or updated, and will be removed after * a reasonable amount of time has passed. All new code should be written against clFFT.h. * Older projects should migrate to the new header at their earliest convenience. 
*/ /** * @mainpage OpenCL BLAS * */ #include "clBLAS.h" /* The following header defines a fixed version number as this header is deprecated and won't be updated */ #include "clAmdBlas.version.h" #ifdef __cplusplus extern "C" { #endif /** * @defgroup OVERVIEW Overview * * This library provides an implementation of the Basic Linear Algebra Subprograms levels 1, 2 and 3, * using OpenCL and optimized for AMD GPU hardware. It provides BLAS-1 functions * SWAP, SCAL, COPY, AXPY, DOT, DOTU, DOTC, ROTG, ROTMG, ROT, ROTM, iAMAX, ASUM and NRM2, * BLAS-2 functions GEMV, SYMV, TRMV, TRSV, HEMV, SYR, SYR2, HER, HER2, GER, GERU, GERC, * TPMV, SPMV, HPMV, TPSV, SPR, SPR2, HPR, HPR2, GBMV, TBMV, SBMV, HBMV and TBSV * and BLAS-3 functions GEMM, SYMM, TRMM, TRSM, HEMM, HERK, HER2K, SYRK and SYR2K. * * This library’s primary goal is to assist the end user to enqueue OpenCL * kernels to process BLAS functions in an OpenCL-efficient manner, while * keeping interfaces familiar to users who know how to use BLAS. All * functions accept matrices through buffer objects. * * @section deprecated * This library provided support for the creation of scratch images to achieve better performance * on older AMD APP SDK's. * However, memory buffers now give the same performance as buffers objects in the current SDK's. * Scratch image buffers are being deprecated and users are advised not to use scratch images in * new applications. */ /** * @defgroup TYPES clAmdBlas types */ /*@{*/ /* Since there is no method to inherit or extend an enum, clAmdBlasOrder is now a set of macro's and typedefs that 'behave' like an enum. The advantage is there is no need to cast between clblasOrder and clAmdBlasOrder */ #define clAmdBlasRowMajor clblasRowMajor #define clAmdBlasColumnMajor clblasColumnMajor typedef enum clblasOrder_ clAmdBlasOrder; /* Since there is no method to inherit or extend an enum, clAmdBlasTranspose is now a set of macro's and typedefs that 'behave' like an enum. The advantage is there is no need to cast between clblasTranspose and clAmdBlasTranspose */ #define clAmdBlasNoTrans clblasNoTrans #define clAmdBlasTrans clblasTrans #define clAmdBlasConjTrans clblasConjTrans typedef enum clblasTranspose_ clAmdBlasTranspose; /* Since there is no method to inherit or extend an enum, clAmdBlasUplo is now a set of macro's and typedefs that 'behave' like an enum. The advantage is there is no need to cast between clblasUplo and clAmdBlasUplo */ #define clAmdBlasUpper clblasUpper #define clAmdBlasLower clblasLower typedef enum clblasUplo_ clAmdBlasUplo; /* Since there is no method to inherit or extend an enum, clAmdBlasDiag is now a set of macro's and typedefs that 'behave' like an enum. The advantage is there is no need to cast between clblasDiag and clAmdBlasDiag */ #define clAmdBlasUnit clblasUnit #define clAmdBlasNonUnit clblasNonUnit typedef enum clblasDiag_ clAmdBlasDiag; /* Since there is no method to inherit or extend an enum, clAmdBlasSide is now a set of macro's and typedefs that 'behave' like an enum. The advantage is there is no need to cast between clblasSide and clAmdBlasSide */ #define clAmdBlasLeft clblasLeft #define clAmdBlasRight clblasRight typedef enum clblasSide_ clAmdBlasSide; /* Since there is no method to inherit or extend an enum, clAmdBlasStatus is now a set of macro's and typedefs that 'behave' like an enum. 
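   For instance, a legacy check such as if (status == clAmdBlasSuccess) keeps compiling
   unchanged, because the macro expands to clblasSuccess and clAmdBlasStatus is simply a
   typedef of clblasStatus_.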
The advantage is there is no need to cast between clblasStatus and clAmdBlasStatus */ #define clAmdBlasSuccess clblasSuccess #define clAmdBlasInvalidValue clblasInvalidValue #define clAmdBlasInvalidCommandQueue clblasInvalidCommandQueue #define clAmdBlasInvalidContext clblasInvalidContext #define clAmdBlasInvalidMemObject clblasInvalidMemObject #define clAmdBlasInvalidDevice clblasInvalidDevice #define clAmdBlasInvalidEventWaitList clblasInvalidEventWaitList #define clAmdBlasOutOfResources clblasOutOfResources #define clAmdBlasOutOfHostMemory clblasOutOfHostMemory #define clAmdBlasInvalidOperation clblasInvalidOperation #define clAmdBlasCompilerNotAvailable clblasCompilerNotAvailable #define clAmdBlasBuildProgramFailure clblasBuildProgramFailure #define clAmdBlasNotImplemented clblasNotImplemented #define clAmdBlasNotInitialized clblasNotInitialized #define clAmdBlasInvalidMatA clblasInvalidMatA #define clAmdBlasInvalidMatB clblasInvalidMatB #define clAmdBlasInvalidMatC clblasInvalidMatC #define clAmdBlasInvalidVecX clblasInvalidVecX #define clAmdBlasInvalidVecY clblasInvalidVecY #define clAmdBlasInvalidDim clblasInvalidDim #define clAmdBlasInvalidLeadDimA clblasInvalidLeadDimA #define clAmdBlasInvalidLeadDimB clblasInvalidLeadDimB #define clAmdBlasInvalidLeadDimC clblasInvalidLeadDimC #define clAmdBlasInvalidIncX clblasInvalidIncX #define clAmdBlasInvalidIncY clblasInvalidIncY #define clAmdBlasInsufficientMemMatA clblasInsufficientMemMatA #define clAmdBlasInsufficientMemMatB clblasInsufficientMemMatB #define clAmdBlasInsufficientMemMatC clblasInsufficientMemMatC #define clAmdBlasInsufficientMemVecX clblasInsufficientMemVecX #define clAmdBlasInsufficientMemVecY clblasInsufficientMemVecY typedef enum clblasStatus_ clAmdBlasStatus; /*@}*/ /** * @defgroup VERSION Version information */ /*@{*/ /** * @brief Get the clAmdBlas library version info. * * @param[out] major Location to store library's major version. * @param[out] minor Location to store library's minor version. * @param[out] patch Location to store library's patch version. * * @returns always \b clAmdBlasSuccess. * * @ingroup VERSION */ __inline clAmdBlasStatus clAmdBlasGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch ) { return clblasGetVersion( major, minor, patch ); } /*@}*/ /** * @defgroup INIT Initialize library */ /*@{*/ /** * @brief Initialize the clAmdBlas library. * * Must be called before any other clAmdBlas API function is invoked. * @note This function is not thread-safe. * * @return * - \b clAmdBlasSucces on success; * - \b clAmdBlasOutOfHostMemory if there is not enough of memory to allocate * library's internal structures; * - \b clAmdBlasOutOfResources in case of requested resources scarcity. * * @ingroup INIT */ __inline clAmdBlasStatus clAmdBlasSetup( ) { return clblasSetup( ); } /** * @brief Finalize the usage of the clAmdBlas library. * * Frees all memory allocated for different computational kernel and other * internal data. * @note This function is not thread-safe. * * @ingroup INIT */ __inline void clAmdBlasTeardown( ) { clblasTeardown( ); } /*@}*/ /** * @defgroup MISC Miscellaneous */ /*@{*/ /** * @deprecated * @brief Create scratch image. * * Images created with this function can be used by the library to switch from * a buffer-based to an image-based implementation. This can increase * performance up to 2 or 3 times over buffer-objects-based ones on same systems. * To leverage the GEMM and TRMM kernels, it is necessary to create two images. 
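 *
 * As a purely illustrative sketch (the variable names below are hypothetical
 * and not part of this header), a caller could reserve the two images before
 * enqueuing its GEMM/TRMM work and release them afterwards:
 * @code
 * clAmdBlasStatus st;
 * cl_ulong imgA = clAmdBlasAddScratchImage(ctx, width1, height, &st);
 * cl_ulong imgB = clAmdBlasAddScratchImage(ctx, width2, height, &st);
 * // ... enqueue the GEMM / TRMM calls that can use the images ...
 * clAmdBlasRemoveScratchImage(imgB);
 * clAmdBlasRemoveScratchImage(imgA);
 * @endcode
 * where width1, width2 and height must satisfy the bounds described below.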
* * The following description provides bounds for the width and height arguments * for functions that can use scratch images. * * Let \c type be the data type of the function in question. * * Let fl4RelSize(type) = sizeof(cl_float4) / sizeof(type). * * Let \c width1 and \c width2 be the respective values of the width argument * passed into the function for the two images needed to activate the image-based * algorithm. Similarly, let \c height1 and \c height2 be the values for the * height argument. * * Let div_up(x,y) = (x + y – 1) / y. * * Let round_up(x,y) = div_up(x,y) * y. * * Let round_down(x,y) = (x / y) * y. * * Then: * * For \b xGEMM there should be 2 images satisfying the following requirements: * - width1 >= round_up(K, 64) / fl4RelSize(type), * - width2 >= round_up(K, 64) / fl4RelSize(type), * - height >= 64M, * * for any transA, transB, and order. * * For \b xTRMM: * - width1 >= round_up(T, 64) / fl4RelSize(type), * - width2 >= round_up(N, 64) / fl4RelSize(type), * - height >= 64, * * for any transA, transB and order, where * - \c T = M, for \c side = clAmdBlasLeft, and * - \c T = N, for \c side = clAmdBlasRight. * * For \b xTRSM: * - round_down(width, 32) * round_down(height, 32) * fl4RelSize(type) >= 1/2 * (round_up(T, 32)^2 + div_up(T, 32) * 32^2) * * for any transA, transB and order, where * - \c T = M, for \c side = clAmdBlasLeft, and * - \c T = N, for \c side = clAmdBlasRight. * * A call to clAmdAddScratchImage with arguments \c width and \c height reserves * approximately width * height * 16 bytes of device memory. * * @return A created image identifier. * * @ingroup MISC */ cl_ulong clAmdBlasAddScratchImage( cl_context context, size_t width, size_t height, clAmdBlasStatus *status); /** * @deprecated * @brief Release scratch image. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if an invalid image identified is passed. * * @ingroup MISC */ clAmdBlasStatus clAmdBlasRemoveScratchImage( cl_ulong imageID); /*@}*/ /** * @defgroup BLAS1 BLAS-1 functions * * The Level 1 Basic Linear Algebra Subprograms are functions that perform * vector-vector operations. */ /*@{*/ /*@}*/ /** * @defgroup SWAP SWAP - Swap elements from 2 vectors * @ingroup BLAS1 */ /*@{*/ /** * @brief interchanges two vectors of float. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
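 *
 * A minimal host-side sketch (illustrative only; the buffer and queue names
 * are hypothetical, see also example_sswap.c):
 * @code
 * cl_event event = NULL;
 * clAmdBlasStatus err = clAmdBlasSswap(N, bufX, 0, 1, bufY, 0, 1,
 *                                      1, &queue, 0, NULL, &event);
 * if (err == clAmdBlasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode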
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SWAP */ __inline clAmdBlasStatus clAmdBlasSswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sswap.c * Example of how to use the @ref clAmdBlasSswap function. */ /** * @brief interchanges two vectors of double. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSswap() function otherwise. * * @ingroup SWAP */ __inline clAmdBlasStatus clAmdBlasDswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief interchanges two vectors of complex-float elements. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. 
* @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSwap() function otherwise. * * @ingroup SWAP */ __inline clAmdBlasStatus clAmdBlasCswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief interchanges two vectors of double-complex elements. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasDwap() function otherwise. * * @ingroup SWAP */ __inline clAmdBlasStatus clAmdBlasZswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SCAL SCAL - Scales a vector by a constant * @ingroup BLAS1 */ /*@{*/ /** * @brief Scales a float vector by a float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
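 *
 * A minimal host-side sketch (illustrative only; the buffer and queue names
 * are hypothetical, see also example_sscal.c) that halves N contiguous
 * elements in place:
 * @code
 * cl_event event = NULL;
 * clAmdBlasStatus err = clAmdBlasSscal(N, 0.5f, bufX, 0, 1,
 *                                      1, &queue, 0, NULL, &event);
 * @endcode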
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - \b incx zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SCAL */ __inline clAmdBlasStatus clAmdBlasSscal( size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sscal.c * Example of how to use the @ref clAmdBlasSscal function. */ /** * @brief Scales a double vector by a double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSscal() function otherwise. * * @ingroup SCAL */ __inline clAmdBlasStatus clAmdBlasDscal( size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Scales a complex-float vector by a complex-float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. 
* @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSscal() function otherwise. * * @ingroup SCAL */ __inline clAmdBlasStatus clAmdBlasCscal( size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Scales a complex-double vector by a complex-double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasDscal() function otherwise. * * @ingroup SCAL */ __inline clAmdBlasStatus clAmdBlasZscal( size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SSCAL SSCAL - Scales a complex vector by a real constant * @ingroup BLAS1 */ /*@{*/ /** * @brief Scales a complex-float vector by a float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - \b incx zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SSCAL */ __inline clAmdBlasStatus clAmdBlasCsscal( size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @example example_csscal.c * Example of how to use the @ref clAmdBlasCsscal function. */ /** * @brief Scales a complex-double vector by a double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasCsscal() function otherwise. * * @ingroup SSCAL */ __inline clAmdBlasStatus clAmdBlasZdscal( size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZdscal( N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup COPY COPY - Copies elements from vector X to vector Y * @ingroup BLAS1 */ /*@{*/ /** * @brief Copies float elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. 
* @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup COPY */ __inline clAmdBlasStatus clAmdBlasScopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasScopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_scopy.c * Example of how to use the @ref clAmdBlasScopy function. */ /** * @brief Copies double elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasScopy() function otherwise. 
* * @ingroup COPY */ __inline clAmdBlasStatus clAmdBlasDcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Copies complex-float elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasScopy() function otherwise. * * @ingroup COPY */ __inline clAmdBlasStatus clAmdBlasCcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Copies complex-double elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasDcopy() function otherwise. 
* * @ingroup COPY */ __inline clAmdBlasStatus clAmdBlasZcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup AXPY AXPY - Scale X and add to Y * @ingroup BLAS1 */ /*@{*/ /** * @brief Scale vector X of float elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup AXPY */ __inline clAmdBlasStatus clAmdBlasSaxpy( size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_saxpy.c * Example of how to use the @ref clAmdBlasSaxpy function. */ /** * @brief Scale vector X of double elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. 
* @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSaxpy() function otherwise. * * @ingroup AXPY */ __inline clAmdBlasStatus clAmdBlasDaxpy( size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Scale vector X of complex-float elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSaxpy() function otherwise. * * @ingroup AXPY */ __inline clAmdBlasStatus clAmdBlasCaxpy( size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Scale vector X of double-complex elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasDaxpy() function otherwise. * * @ingroup AXPY */ __inline clAmdBlasStatus clAmdBlasZaxpy( size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZaxpy( N, alpha, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup DOT DOT - Dot product of two vectors * @ingroup BLAS1 */ /*@{*/ /** * @brief dot product of two vectors containing float elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, \b Y or \b dotProduct object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
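 *
 * A minimal host-side sketch (illustrative only; the buffer, context and queue
 * names are hypothetical, see also example_sdot.c). The result buffer holds a
 * single float and the scratch buffer holds at least N floats:
 * @code
 * cl_int clErr;
 * cl_event event = NULL;
 * cl_mem dotBuf  = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float), NULL, &clErr);
 * cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &clErr);
 * clAmdBlasStatus err = clAmdBlasSdot(N, dotBuf, 0, bufX, 0, 1, bufY, 0, 1,
 *                                     scratch, 1, &queue, 0, NULL, &event);
 * @endcode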
* * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasSdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSdot( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sdot.c * Example of how to use the @ref clAmdBlasSdot function. */ /** * @brief dot product of two vectors containing double elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSdot() function otherwise. * * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasDdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDdot( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief dot product of two vectors containing float-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. 
* @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSdot() function otherwise. * * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasCdotu( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCdotu( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief dot product of two vectors containing double-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSdot() function otherwise. * * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasZdotu( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZdotu( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief dot product of two vectors containing float-complex elements conjugating the first vector * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. 
* @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSdot() function otherwise. * * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasCdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCdotc( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief dot product of two vectors containing double-complex elements conjugating the first vector * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSdot() function otherwise. * * @ingroup DOT */ __inline clAmdBlasStatus clAmdBlasZdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZdotc( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup ROTG ROTG - Constructs givens plane rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief construct givens plane rotation on float elements * * @param[out] SA Buffer object that contains SA * @param[in] offSA Offset to SA in \b SA buffer object. * Counted in elements. * @param[out] SB Buffer object that contains SB * @param[in] offSB Offset to SB in \b SB buffer object. 
* Counted in elements. * @param[out] C Buffer object that contains C * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidMemObject if either \b SA, \b SB, \b C or \b S object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTG */ __inline clAmdBlasStatus clAmdBlasSrotg( cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSrotg( SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_srotg.c * Example of how to use the @ref clAmdBlasSrotg function. */ /** * @brief construct givens plane rotation on double elements * * @param[out] DA Buffer object that contains DA * @param[in] offDA Offset to DA in \b DA buffer object. * Counted in elements. * @param[out] DB Buffer object that contains DB * @param[in] offDB Offset to DB in \b DB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSrotg() function otherwise. 
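 *
 * A minimal usage sketch, assuming clAmdBlasSetup() has already succeeded,
 * that \b ctx and \b queue are a valid OpenCL context and command queue on a
 * device with double-precision support, and that the helper and variable
 * names below (drotg_host, bufA, bufB, bufC, bufS) are illustrative only.
 * Following the usual BLAS rotg convention, \b a and \b b are updated in
 * place while \b c and \b s receive the rotation; errors are ignored for brevity.
 * @code
 * // Generate a Givens rotation for the host scalars a and b.
 * void drotg_host(cl_context ctx, cl_command_queue queue,
 *                 cl_double *a, cl_double *b, cl_double *c, cl_double *s)
 * {
 *     cl_event ev = NULL;
 *     cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  sizeof(cl_double), a, NULL);
 *     cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  sizeof(cl_double), b, NULL);
 *     cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_double), NULL, NULL);
 *     cl_mem bufS = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_double), NULL, NULL);
 *
 *     if (clAmdBlasDrotg(bufA, 0, bufB, 0, bufC, 0, bufS, 0,
 *                        1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *         // The first blocking read waits for the rotation via its wait list.
 *         clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, sizeof(cl_double), a, 1, &ev, NULL);
 *         clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0, sizeof(cl_double), b, 0, NULL, NULL);
 *         clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(cl_double), c, 0, NULL, NULL);
 *         clEnqueueReadBuffer(queue, bufS, CL_TRUE, 0, sizeof(cl_double), s, 0, NULL, NULL);
 *     }
 *     clReleaseMemObject(bufS);
 *     clReleaseMemObject(bufC);
 *     clReleaseMemObject(bufB);
 *     clReleaseMemObject(bufA);
 *     if (ev) clReleaseEvent(ev);
 * }
 * @endcode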
* * @ingroup ROTG */ __inline clAmdBlasStatus clAmdBlasDrotg( cl_mem DA, size_t offDA, cl_mem DB, size_t offDB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDrotg( DA, offDA, DB, offDB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief construct givens plane rotation on float-complex elements * * @param[out] CA Buffer object that contains CA * @param[in] offCA Offset to CA in \b CA buffer object. * Counted in elements. * @param[out] CB Buffer object that contains CB * @param[in] offCB Offset to CB in \b CB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C. C is real. * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSrotg() function otherwise. * * @ingroup ROTG */ __inline clAmdBlasStatus clAmdBlasCrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCrotg( CA, offCA, CB, offCB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief construct givens plane rotation on double-complex elements * * @param[out] CA Buffer object that contains CA * @param[in] offCA Offset to CA in \b CA buffer object. * Counted in elements. * @param[out] CB Buffer object that contains CB * @param[in] offCB Offset to CB in \b CB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C. C is real. * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasDrotg() function otherwise. 
* * @ingroup ROTG */ __inline clAmdBlasStatus clAmdBlasZrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZrotg( CA, offCA, CB, offCB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup ROTMG ROTMG - Constructs the modified givens rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief construct the modified givens rotation on float elements * * @param[out] SD1 Buffer object that contains SD1 * @param[in] offSD1 Offset to SD1 in \b SD1 buffer object. * Counted in elements. * @param[out] SD2 Buffer object that contains SD2 * @param[in] offSD2 Offset to SD2 in \b SD2 buffer object. * Counted in elements. * @param[out] SX1 Buffer object that contains SX1 * @param[in] offSX1 Offset to SX1 in \b SX1 buffer object. * Counted in elements. * @param[in] SY1 Buffer object that contains SY1 * @param[in] offSY1 Offset to SY1 in \b SY1 buffer object. * Counted in elements. * @param[out] SPARAM Buffer object that contains SPARAM array of minimum length 5 SPARAM(0) = SFLAG SPARAM(1) = SH11 SPARAM(2) = SH21 SPARAM(3) = SH12 SPARAM(4) = SH22 * @param[in] offSparam Offset to SPARAM in \b SPARAM buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidMemObject if either \b SX1, \b SY1, \b SD1, \b SD2 or \b SPARAM object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTMG */ __inline clAmdBlasStatus clAmdBlasSrotmg( cl_mem SD1, size_t offSD1, cl_mem SD2, size_t offSD2, cl_mem SX1, size_t offSX1, const cl_mem SY1, size_t offSY1, cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSrotmg( SD1, offSD1, SD2, offSD2, SX1, offSX1, SY1, offSY1, SPARAM, offSparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_srotmg.c * Example of how to use the @ref clAmdBlasSrotmg function. */ /** * @brief construct the modified givens rotation on double elements * * @param[out] DD1 Buffer object that contains DD1 * @param[in] offDD1 Offset to DD1 in \b DD1 buffer object. * Counted in elements. * @param[out] DD2 Buffer object that contains DD2 * @param[in] offDD2 Offset to DD2 in \b DD2 buffer object. 
* Counted in elements. * @param[out] DX1 Buffer object that contains DX1 * @param[in] offDX1 Offset to DX1 in \b DX1 buffer object. * Counted in elements. * @param[in] DY1 Buffer object that contains DY1 * @param[in] offDY1 Offset to DY1 in \b DY1 buffer object. * Counted in elements. * @param[out] DPARAM Buffer object that contains DPARAM array of minimum length 5 DPARAM(0) = DFLAG DPARAM(1) = DH11 DPARAM(2) = DH21 DPARAM(3) = DH12 DPARAM(4) = DH22 * @param[in] offDparam Offset to DPARAM in \b DPARAM buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSrotmg() function otherwise. * * @ingroup ROTMG */ __inline clAmdBlasStatus clAmdBlasDrotmg( cl_mem DD1, size_t offDD1, cl_mem DD2, size_t offDD2, cl_mem DX1, size_t offDX1, const cl_mem DY1, size_t offDY1, cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDrotmg( DD1, offDD1, DD2, offDD2, DX1, offDX1, DY1, offDY1, DPARAM, offDparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup ROT ROT - Apply givens rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief applies a plane rotation for float elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. * @param[in] S S specifies the sine, sin. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
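 *
 * A minimal usage sketch, assuming clAmdBlasSetup() has already succeeded and
 * that \b ctx and \b queue are a valid OpenCL context and command queue; the
 * helper and buffer names below (rotate_vectors, bufX, bufY) are illustrative
 * only and errors are ignored for brevity.
 * @code
 * // Apply the plane rotation defined by c and s to two float vectors of
 * // length n held in the host arrays hx and hy.
 * void rotate_vectors(cl_context ctx, cl_command_queue queue,
 *                     cl_float *hx, cl_float *hy, size_t n,
 *                     cl_float c, cl_float s)
 * {
 *     cl_event ev = NULL;
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  n * sizeof(cl_float), hx, NULL);
 *     cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  n * sizeof(cl_float), hy, NULL);
 *
 *     if (clAmdBlasSrot(n, bufX, 0, 1, bufY, 0, 1, c, s,
 *                       1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *         // Wait for the rotation (via the event wait list) and read back.
 *         clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, n * sizeof(cl_float),
 *                             hx, 1, &ev, NULL);
 *         clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, n * sizeof(cl_float),
 *                             hy, 0, NULL, NULL);
 *     }
 *     clReleaseMemObject(bufY);
 *     clReleaseMemObject(bufX);
 *     if (ev) clReleaseEvent(ev);
 * }
 * @endcode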
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROT */ __inline clAmdBlasStatus clAmdBlasSrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSrot( N, X, offx, incx, Y, offy, incy, C, S, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_srot.c * Example of how to use the @ref clAmdBlasSrot function. */ /** * @brief applies a plane rotation for double elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. * @param[in] S S specifies the sine, sin. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSrot() function otherwise. * * @ingroup ROT */ __inline clAmdBlasStatus clAmdBlasDrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDrot( N, X, offx, incx, Y, offy, incy, C, S, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief applies a plane rotation for float-complex elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. 
* @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. This number is real * @param[in] S S specifies the sine, sin. This number is real * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSrot() function otherwise. * * @ingroup ROT */ __inline clAmdBlasStatus clAmdBlasCsrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsrot( N, X, offx, incx, Y, offy, incy, C, S, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief applies a plane rotation for double-complex elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. This number is real * @param[in] S S specifies the sine, sin. This number is real * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSrot() function otherwise. * * @ingroup ROT */ __inline clAmdBlasStatus clAmdBlasZdrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZdrot( N, X, offx, incx, Y, offy, incy, C, S, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup ROTM ROTM - Apply modified givens rotation for points in the plane * @ingroup BLAS1 */ /*@{*/ /** * @brief modified givens rotation for float elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. 
* @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] SPARAM Buffer object that contains SPARAM array of minimum length 5 * SPARAM(1)=SFLAG * SPARAM(2)=SH11 * SPARAM(3)=SH21 * SPARAM(4)=SH12 * SPARAM(5)=SH22 * @param[in] offSparam Offset of first element of array \b SPARAM in buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b X, \b Y or \b SPARAM object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTM */ __inline clAmdBlasStatus clAmdBlasSrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSrotm( N, X, offx, incx, Y, offy, incy, SPARAM, offSparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_srotm.c * Example of how to use the @ref clAmdBlasSrotm function. */ /** * @brief modified givens rotation for double elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] DPARAM Buffer object that contains DPARAM array of minimum length 5 * DPARAM(1)=DFLAG * DPARAM(2)=DH11 * DPARAM(3)=DH21 * DPARAM(4)=DH12 * DPARAM(5)=DH22 * @param[in] offDparam Offset of first element of array \b DPARAM in buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. 
* @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSrotm() function otherwise. * * @ingroup ROTM */ __inline clAmdBlasStatus clAmdBlasDrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDrotm( N, X, offx, incx, Y, offy, incy, DPARAM, offDparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup NRM2 NRM2 - Euclidean norm of a vector * @ingroup BLAS1 */ /*@{*/ /** * @brief computes the euclidean norm of vector containing float elements * * NRM2 = sqrt( X' * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if any of \b X or \b NRM2 or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
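 *
 * A minimal usage sketch, assuming clAmdBlasSetup() has already succeeded and
 * that \b ctx and \b queue are a valid OpenCL context and command queue; the
 * helper and buffer names below (host_snrm2, bufX, bufRes, scratch) are
 * illustrative only and errors are ignored for brevity.
 * @code
 * // Compute the Euclidean norm of a float vector of length n and return it.
 * cl_float host_snrm2(cl_context ctx, cl_command_queue queue,
 *                     const cl_float *hx, size_t n)
 * {
 *     cl_float result = 0.0f;
 *     cl_event ev = NULL;
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  n * sizeof(cl_float), (void *)hx, NULL);
 *     cl_mem bufRes = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
 *                                    sizeof(cl_float), NULL, NULL);
 *     // The scratch buffer must hold at least 2*N elements (see above).
 *     cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     2 * n * sizeof(cl_float), NULL, NULL);
 *
 *     if (clAmdBlasSnrm2(n, bufRes, 0, bufX, 0, 1, scratch,
 *                        1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *         clEnqueueReadBuffer(queue, bufRes, CL_TRUE, 0, sizeof(cl_float),
 *                             &result, 1, &ev, NULL);
 *     }
 *     clReleaseMemObject(scratch);
 *     clReleaseMemObject(bufRes);
 *     clReleaseMemObject(bufX);
 *     if (ev) clReleaseEvent(ev);
 *     return result;
 * }
 * @endcode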
* * @ingroup NRM2 */ __inline clAmdBlasStatus clAmdBlasSnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_snrm2.c * Example of how to use the @ref clAmdBlasSnrm2 function. */ /** * @brief computes the euclidean norm of vector containing double elements * * NRM2 = sqrt( X' * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSnrm2() function otherwise. * * @ingroup NRM2 */ __inline clAmdBlasStatus clAmdBlasDnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief computes the euclidean norm of vector containing float-complex elements * * NRM2 = sqrt( X**H * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value. * Note that the answer of Scnrm2 is a real value. * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSnrm2() function otherwise. 
* * @ingroup NRM2 */ __inline clAmdBlasStatus clAmdBlasScnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasScnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief computes the euclidean norm of vector containing double-complex elements * * NRM2 = sqrt( X**H * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value. * Note that the answer of Dznrm2 is a real value. * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSnrm2() function otherwise. * * @ingroup NRM2 */ __inline clAmdBlasStatus clAmdBlasDznrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDznrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup iAMAX iAMAX - Index of max absolute value * @ingroup BLAS1 */ /*@{*/ /** * @brief index of max absolute value in a float array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
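 *
 * A minimal usage sketch, assuming clAmdBlasSetup() has already succeeded and
 * that \b ctx and \b queue are a valid OpenCL context and command queue; the
 * helper and buffer names below (host_isamax, bufX, bufIMax, scratch) are
 * illustrative only and errors are ignored for brevity.
 * @code
 * // Return the index reported by clAmdBlasiSamax for the float vector hx of
 * // length n (idx stays 0 if the call fails).
 * cl_uint host_isamax(cl_context ctx, cl_command_queue queue,
 *                     const cl_float *hx, size_t n)
 * {
 *     cl_uint idx = 0;
 *     cl_event ev = NULL;
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  n * sizeof(cl_float), (void *)hx, NULL);
 *     cl_mem bufIMax = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
 *                                     sizeof(cl_uint), NULL, NULL);
 *     // The scratch buffer must be able to hold at least 2*N elements.
 *     cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     2 * n * sizeof(cl_float), NULL, NULL);
 *
 *     if (clAmdBlasiSamax(n, bufIMax, 0, bufX, 0, 1, scratch,
 *                         1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *         clEnqueueReadBuffer(queue, bufIMax, CL_TRUE, 0, sizeof(cl_uint),
 *                             &idx, 1, &ev, NULL);
 *     }
 *     clReleaseMemObject(scratch);
 *     clReleaseMemObject(bufIMax);
 *     clReleaseMemObject(bufX);
 *     if (ev) clReleaseEvent(ev);
 *     return idx;
 * }
 * @endcode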
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if any of \b iMax, \b X or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if the context the passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup iAMAX */ __inline clAmdBlasStatus clAmdBlasiSamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasiSamax( N, iMax, offiMax, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_isamax.c * Example of how to use the @ref clAmdBlasiSamax function. */ /** * @brief index of max absolute value in a double array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasiSamax() function otherwise. * * @ingroup iAMAX */ __inline clAmdBlasStatus clAmdBlasiDamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasiDamax( N, iMax, offiMax, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief index of max absolute value in a complex float array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. 
* The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasiSamax() function otherwise. * * @ingroup iAMAX */ __inline clAmdBlasStatus clAmdBlasiCamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasiCamax( N, iMax, offiMax, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief index of max absolute value in a complex double array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasiSamax() function otherwise. * * @ingroup iAMAX */ __inline clAmdBlasStatus clAmdBlasiZamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasiZamax( N, iMax, offiMax, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup ASUM ASUM - Sum of absolute values * @ingroup BLAS1 */ /*@{*/ /** * @brief absolute sum of values of a vector containing float elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. 
* Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if any of \b X or \b asum or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ASUM */ __inline clAmdBlasStatus clAmdBlasSasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sasum.c * Example of how to use the @ref clAmdBlasSasum function. */ /** * @brief absolute sum of values of a vector containing double elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSasum() function otherwise. 
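 *
 * A minimal call sketch, assuming \b queue is a valid command queue on a
 * device with double-precision support and that \b bufX (n doubles, input),
 * \b bufAsum (one double, result) and \b scratch (at least n doubles) are
 * cl_mem buffers created elsewhere; all names are illustrative only.
 * @code
 * cl_event ev = NULL;
 * cl_double asum_host = 0.0;
 * if (clAmdBlasDasum(n, bufAsum, 0, bufX, 0, 1, scratch,
 *                    1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *     // Blocking read; the wait list makes it wait for the ASUM kernel.
 *     clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_double),
 *                         &asum_host, 1, &ev, NULL);
 * }
 * @endcode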
* * @ingroup ASUM */ __inline clAmdBlasStatus clAmdBlasDasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief absolute sum of values of a vector containing float-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSasum() function otherwise. * * @ingroup ASUM */ __inline clAmdBlasStatus clAmdBlasScasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasScasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief absolute sum of values of a vector containing double-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSasum() function otherwise. 
* * @ingroup ASUM */ __inline clAmdBlasStatus clAmdBlasDzasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDzasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup BLAS2 BLAS-2 functions * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * matrix-vector operations. */ /*@{*/ /*@}*/ /** * @defgroup GEMV GEMV - General matrix-Vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a general rectangular matrix and * float elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. Must not be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasSgemvEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b M or \b N is zero, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix size or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A, \b x, or \b y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
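 *
 * A minimal usage sketch, assuming clAmdBlasSetup() has already succeeded and
 * that \b ctx and \b queue are a valid OpenCL context and command queue; the
 * helper and buffer names below (sgemv_rowmajor, bufA, bufX, bufY) are
 * illustrative only, errors are ignored for brevity, and the order/transpose
 * values use the clblasOrder and clblasTranspose enumerations that appear in
 * the wrapper's signature.
 * @code
 * // y <- alpha*A*x + beta*y for a tightly packed row-major m-by-n float
 * // matrix A (so lda == n), with x of length n and y of length m.
 * void sgemv_rowmajor(cl_context ctx, cl_command_queue queue,
 *                     const cl_float *hA, const cl_float *hx, cl_float *hy,
 *                     size_t m, size_t n, cl_float alpha, cl_float beta)
 * {
 *     cl_event ev = NULL;
 *     cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  m * n * sizeof(cl_float), (void *)hA, NULL);
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  n * sizeof(cl_float), (void *)hx, NULL);
 *     cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  m * sizeof(cl_float), hy, NULL);
 *
 *     if (clAmdBlasSgemv(clblasRowMajor, clblasNoTrans, m, n, alpha, bufA, n,
 *                        bufX, 0, 1, beta, bufY, 0, 1,
 *                        1, &queue, 0, NULL, &ev) == clAmdBlasSuccess) {
 *         clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, m * sizeof(cl_float),
 *                             hy, 1, &ev, NULL);
 *     }
 *     clReleaseMemObject(bufY);
 *     clReleaseMemObject(bufX);
 *     clReleaseMemObject(bufA);
 *     if (ev) clReleaseEvent(ev);
 * }
 * @endcode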
* * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasSgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgemv( order, transA, M, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sgemv.c * Example of how to use the @ref clAmdBlasSgemv function. */ /** * @brief Matrix-vector product with a general rectangular matrix and * double elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDgemvEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasDgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgemv( order, transA, M, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular matrix and * float complex elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. 
* @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCgemvEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasCgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgemv( order, transA, M, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular matrix and * double complex elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZgemvEx() instead. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasZgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgemv( order, transA, M, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular matrix and * float elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasSgemvEx( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sgemv.c * This is an example of how to use the @ref clAmdBlasSgemvEx function. */ /** * @brief Matrix-vector product with a general rectangular matrix and * double elements. Extended version. 
* * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of \b A in the buffer * object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasDgemvEx( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular matrix and * float complex elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. 
* Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasCgemvEx( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular matrix and * double complex elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clAmdBlasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSgemv() function otherwise. 
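 *
 * @par Example
 * A minimal usage sketch added for illustration, not part of the original
 * documentation. It assumes \b queue, \b bufA, \b bufX and \b bufY were created
 * beforehand, that the device supports double precision, and that DoubleComplex
 * is the library's cl_double2 typedef. The \b offA parameter lets the matrix
 * start partway into the buffer, which is the main difference from clAmdBlasZgemv().
 * @code
 * size_t M = 4, N = 4, offA = 16;       // A begins 16 elements into bufA
 * DoubleComplex alpha, beta;
 * alpha.s[0] = 1.0; alpha.s[1] = 0.0;   // alpha = 1 + 0i
 * beta.s[0]  = 0.0; beta.s[1]  = 0.0;   // beta  = 0
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasZgemvEx(
 *     clblasColumnMajor, clblasNoTrans, M, N,
 *     alpha, bufA, offA, M,
 *     bufX, 0, 1, beta, bufY, 0, 1,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode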
* * @ingroup GEMV */ __inline clAmdBlasStatus clAmdBlasZgemvEx( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgemv( order, transA, M, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SYMV SYMV - Symmetric matrix-Vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric matrix and float elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects for each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for use in new * applications. Use the superseding function clAmdBlasSsymvEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A, \b x, or \b y object is * invalid, or is an image object rather than a buffer object; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if the context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
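 *
 * @par Example
 * A minimal usage sketch added for illustration (it is not the shipped
 * example_ssymv.c). It assumes \b queue, \b bufA, \b bufX and \b bufY are
 * placeholder objects created beforehand and that the upper triangle of \b A
 * holds the symmetric data being referenced.
 * @code
 * size_t N = 5;
 * cl_float alpha = 2.0f, beta = 1.0f;
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasSsymv(
 *     clblasColumnMajor, clblasUpper, N,
 *     alpha, bufA, N,          // lda >= N
 *     bufX, 0, 1, beta, bufY, 0, 1,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode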
* * @ingroup SYMV */ __inline clAmdBlasStatus clAmdBlasSsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsymv( order, uplo, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssymv.c * This is an example of how to use the @ref clAmdBlasSsymv function. */ /** * @brief Matrix-vector product with a symmetric matrix and double elements. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDsymvEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsymv() function otherwise. * * @ingroup SYMV */ __inline clAmdBlasStatus clAmdBlasDsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsymv( order, uplo, N, alpha, A, 0, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a symmetric matrix and float elements. * Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. 
It cannot less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSgemv() function otherwise. * * @ingroup SYMV */ __inline clAmdBlasStatus clAmdBlasSsymvEx( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsymv( order, uplo, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssymv.c * This is an example of how to use the @ref clAmdBlasSsymv function. */ /** * @brief Matrix-vector product with a symmetric matrix and double elements. * Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
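 *
 * @par Example
 * A minimal usage sketch added for illustration, not part of the original
 * documentation. It assumes \b queue, \b bufA, \b bufX and \b bufY already exist
 * and that the device supports double precision. \b offA allows the symmetric
 * matrix to begin at an element offset inside \b bufA.
 * @code
 * size_t N = 8, offA = 4;      // A begins 4 elements into bufA
 * cl_double alpha = 1.0, beta = 0.0;
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasDsymvEx(
 *     clblasColumnMajor, clblasLower, N,
 *     alpha, bufA, offA, N,    // lda >= N
 *     bufX, 0, 1, beta, bufY, 0, 1,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode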
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clAmdBlasSsymv() function otherwise. * * @ingroup SYMV */ __inline clAmdBlasStatus clAmdBlasDsymvEx( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsymv( order, uplo, N, alpha, A, offA, lda, x, offx, incx, beta, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HEMV HEMV - Hermitian matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a Hermitian matrix and float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects for each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is * invalid, or is an image object rather than a buffer object; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if the context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
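 *
 * @par Example
 * A minimal usage sketch added for illustration, not part of the original
 * documentation. It assumes \b queue and the complex-valued buffers \b bufA,
 * \b bufX and \b bufY were created beforehand, and that FloatComplex is the
 * library's cl_float2 typedef, so the real and imaginary parts are set through
 * its \c s components.
 * @code
 * size_t N = 6;
 * FloatComplex alpha, beta;
 * alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;   // alpha = 1 + 0i
 * beta.s[0]  = 0.0f; beta.s[1]  = 0.0f;   // beta  = 0
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasChemv(
 *     clblasColumnMajor, clblasUpper, N,
 *     alpha, bufA, 0, N,       // offa = 0, lda >= N
 *     bufX, 0, 1, beta, bufY, 0, 1,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode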
* * @ingroup HEMV */ __inline clAmdBlasStatus clAmdBlasChemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a hermitian matrix and double-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot less * than \b N. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChemv() function otherwise. * * @ingroup HEMV */ __inline clAmdBlasStatus clAmdBlasZhemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_zhemv.cpp * Example of how to use the @ref clAmdBlasZhemv function. */ /*@}*/ /** * @defgroup TRMV TRMV - Triangular matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a triangular matrix and * float elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. 
* @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TRMV */ __inline clAmdBlasStatus clAmdBlasStrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strmv.c * Example of how to use the @ref clAmdBlasStrmv function. */ /** * @brief Matrix-vector product with a triangular matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStrmv() function otherwise. * * @ingroup TRMV */ __inline clAmdBlasStatus clAmdBlasDtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a triangular matrix and * float complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasStrmv() function. * @ingroup TRMV */ __inline clAmdBlasStatus clAmdBlasCtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a triangular matrix and * double complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. 
* @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDtrmv() function. * @ingroup TRMV */ __inline clAmdBlasStatus clAmdBlasZtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TRSV TRSV - Triangular matrix vector Solve * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TRSV */ __inline clAmdBlasStatus clAmdBlasStrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strsv.c * Example of how to use the @ref clAmdBlasStrsv function. */ /** * @brief solving triangular matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStrsv() function otherwise. 
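 *
 * @par Example
 * A minimal usage sketch added for illustration, not part of the original
 * documentation. It assumes \b queue, \b bufA and \b bufX are placeholder objects
 * created beforehand, that the device supports double precision, and that \b bufA
 * holds a non-unit upper-triangular matrix. On completion, \b bufX is overwritten
 * in place with the solution of the triangular system.
 * @code
 * size_t N = 10;
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasDtrsv(
 *     clblasColumnMajor, clblasUpper, clblasNoTrans, clblasNonUnit, N,
 *     bufA, 0, N,              // offa = 0, lda >= N
 *     bufX, 0, 1,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode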
* * @ingroup TRSV */ __inline clAmdBlasStatus clAmdBlasDtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular matrix problems with float-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasStrsv() function. * * @ingroup TRSV */ __inline clAmdBlasStatus clAmdBlasCtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular matrix problems with double-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects for each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDtrsv() function. * * @ingroup TRSV */ __inline clAmdBlasStatus clAmdBlasZtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup GER GER - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief Vector-vector product with float elements that * performs the rank 1 operation on matrix \b A. * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha Specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects for each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b M or \b N is zero, or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid, * or is an image object rather than a buffer object; * - \b clAmdBlasOutOfResources if you use the image-based function implementation * and no suitable scratch image is available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if the context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
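 *
 * @par Example
 * A minimal usage sketch added for illustration (it is not the shipped
 * example_sger.c). It assumes \b queue, \b bufX, \b bufY and \b bufA are
 * placeholder objects created beforehand. After the call, \b bufA holds
 * A + alpha * x * y^T.
 * @code
 * size_t M = 4, N = 3;
 * cl_float alpha = 1.0f;
 * cl_event event = NULL;
 *
 * clAmdBlasStatus status = clAmdBlasSger(
 *     clblasColumnMajor, M, N, alpha,
 *     bufX, 0, 1,
 *     bufY, 0, 1,
 *     bufA, 0, M,              // offa = 0, lda >= M for column-major storage
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode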
* * @ingroup GER */ __inline clAmdBlasStatus clAmdBlasSger( clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sger.c * Example of how to use the @ref clAmdBlasSger function. */ /** * @brief vector-vector product with double elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSger() function otherwise. * * @ingroup GER */ __inline clAmdBlasStatus clAmdBlasDger( clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup GERU GERU - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief vector-vector product with float complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. 
* @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M, \b N or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GERU */ __inline clAmdBlasStatus clAmdBlasCgeru( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A , size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief vector-vector product with double complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. 
* @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCgeru() function otherwise. * * @ingroup GERU */ __inline clAmdBlasStatus clAmdBlasZgeru( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup GERC GERC - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief vector-vector product with float complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^H + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M, \b N or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if A, X, or Y object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GERC */ __inline clAmdBlasStatus clAmdBlasCgerc( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A , size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief vector-vector product with double complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^H + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCgerc() function otherwise. 
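 *
 * Illustrative usage sketch (not part of the original header): a minimal
 * host-side call to clAmdBlasZgerc() after clAmdBlasSetup() has succeeded.
 * The names \c queue, \c bufX, \c bufY and \c bufA are placeholders for an
 * existing command queue and cl_mem buffers holding at least M, N and M*N
 * double-complex elements, respectively.
 *
 * @code
 * size_t M = 64, N = 32;
 * cl_double2 alpha = {{1.0, 0.0}};          // alpha = 1 + 0i
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Rank-1 update of the M-by-N matrix A: A <- alpha * X * Y^H + A,
 * // row-major storage, contiguous vectors, leading dimension lda = N.
 * status = clAmdBlasZgerc(clblasRowMajor, M, N, alpha,
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         bufY, 0, 1,       // Y: no offset, unit increment
 *                         bufA, 0, N,       // A: no offset, lda = N
 *                         1, &queue,        // one command queue
 *                         0, NULL, &event);
 * if (status == clAmdBlasSuccess)
 *     clWaitForEvents(1, &event);           // block until the update finishes
 * @endcode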
* * @ingroup GERC */ __inline clAmdBlasStatus clAmdBlasZgerc( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SYR SYR - Symmetric rank 1 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 1 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 1 operation with a general triangular matrix and * float elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha x x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SYR */ __inline clAmdBlasStatus clAmdBlasSsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasSsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Symmetric rank 1 operation with a general triangular matrix and * double elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha x x^T + A \f$ * * @param[in] order Row/column order. 
* @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsyr() function otherwise. * * @ingroup SYR */ __inline clAmdBlasStatus clAmdBlasDsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasDsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HER HER - Hermitian rank 1 operation * * The Level 2 Basic Linear Algebra Subprogram functions that perform * hermitian rank 1 operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief hermitian rank 1 operation with a general triangular matrix and * float-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HER */ __inline clAmdBlasStatus clAmdBlasCher( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasCher( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_cher.c * Example of how to use the @ref clAmdBlasCher function. */ /** * @brief hermitian rank 1 operation with a general triangular matrix and * double-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar double value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCher() function otherwise. 
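 *
 * Illustrative usage sketch (not part of the original header), assuming
 * \c queue is a valid command queue and \c bufX / \c bufA are cl_mem buffers
 * with at least N and N*N double-complex elements (placeholder names used
 * only for this sketch):
 *
 * @code
 * size_t N = 128;
 * cl_double alpha = 2.0;                    // HER takes a real-valued alpha
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Hermitian rank-1 update of the upper triangle: A <- alpha * X * X^H + A,
 * // column-major storage, lda = N, contiguous X.
 * status = clAmdBlasZher(clblasColumnMajor, clblasUpper, N, alpha,
 *                        bufX, 0, 1,        // X: no offset, unit increment
 *                        bufA, 0, N,        // A: no offset, lda = N
 *                        1, &queue, 0, NULL, &event);
 * @endcode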
* * @ingroup HER */ __inline clAmdBlasStatus clAmdBlasZher( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasZher( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SYR2 SYR2 - Symmetric rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 2 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 2 operation with a general triangular matrix and * float elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
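 *
 * Illustrative usage sketch (not part of the original header), assuming an
 * existing command queue \c queue and cl_mem buffers \c bufX, \c bufY and
 * \c bufA of at least N, N and N*N float elements (placeholder names):
 *
 * @code
 * size_t N = 256;
 * cl_float alpha = 1.5f;
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Symmetric rank-2 update of the lower triangle:
 * // A <- alpha * X * Y^T + alpha * Y * X^T + A, row-major, lda = N.
 * status = clAmdBlasSsyr2(clblasRowMajor, clblasLower, N, alpha,
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         bufY, 0, 1,       // Y: no offset, unit increment
 *                         bufA, 0, N,       // A: no offset, lda = N
 *                         1, &queue, 0, NULL, &event);
 * @endcode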
* * @ingroup SYR2 */ __inline clAmdBlasStatus clAmdBlasSsyr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasSsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Symmetric rank 2 operation with a general triangular matrix and * double elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
* * @ingroup SYR2 */ __inline clAmdBlasStatus clAmdBlasDsyr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasDsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HER2 HER2 - Hermitian rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * hermitian rank 2 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Hermitian rank 2 operation with a general triangular matrix and * float-complex elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
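 *
 * Illustrative usage sketch (not part of the original header), assuming an
 * existing command queue \c queue and cl_mem buffers \c bufX, \c bufY and
 * \c bufA of at least N, N and N*N float-complex elements (placeholder names):
 *
 * @code
 * size_t N = 64;
 * cl_float2 alpha = {{1.0f, 0.0f}};         // alpha = 1 + 0i
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Hermitian rank-2 update of the upper triangle:
 * // A <- alpha * X * Y^H + conj(alpha) * Y * X^H + A, column-major, lda = N.
 * status = clAmdBlasCher2(clblasColumnMajor, clblasUpper, N, alpha,
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         bufY, 0, 1,       // Y: no offset, unit increment
 *                         bufA, 0, N,       // A: no offset, lda = N
 *                         1, &queue, 0, NULL, &event);
 * @endcode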
* * @ingroup HER2 */ __inline clAmdBlasStatus clAmdBlasCher2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasCher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Hermitian rank 2 operation with a general triangular matrix and * double-complex elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCher2() function otherwise. * * @ingroup HER2 */ __inline clAmdBlasStatus clAmdBlasZher2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasZher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_zher2.c * Example of how to use the @ref clAmdBlasZher2 function. */ /*@}*/ /** * @defgroup TPMV TPMV - Triangular packed matrix-vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a packed triangular matrix and * float elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP.
* @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero * - \b clAmdBlasInvalidMemObject if either \b AP or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPMV */ __inline clAmdBlasStatus clAmdBlasStpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_stpmv.c * Example of how to use the @ref clAmdBlasStpmv function. */ /** * @brief Matrix-vector product with a packed triangular matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStpmv() function otherwise. * * @ingroup TPMV */ __inline clAmdBlasStatus clAmdBlasDtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a packed triangular matrix and * float-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasStpmv() function. * @ingroup TPMV */ __inline clAmdBlasStatus clAmdBlasCtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a packed triangular matrix and * double-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. 
* @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDtpmv() function. * @ingroup TPMV */ __inline clAmdBlasStatus clAmdBlasZtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TPSV TPSV - Triangular packed matrix vector solve * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular packed matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
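 *
 * Illustrative usage sketch (not part of the original header), assuming an
 * existing command queue \c queue, a cl_mem buffer \c bufAP holding the
 * packed triangle (N*(N+1)/2 float elements) and a cl_mem buffer \c bufX
 * holding the right-hand side (placeholder names for this sketch only):
 *
 * @code
 * size_t N = 100;
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Solve the packed triangular system A * x = b in place: on entry bufX
 * // holds b, on exit it holds the solution x. The upper triangle of A is
 * // stored in packed column-major order.
 * status = clAmdBlasStpsv(clblasColumnMajor, clblasUpper, clblasNoTrans,
 *                         clblasNonUnit, N,
 *                         bufAP, 0,         // packed A, no offset
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         1, &queue, 0, NULL, &event);
 * @endcode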
* * @ingroup TPSV */ __inline clAmdBlasStatus clAmdBlasStpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_stpsv.c * Example of how to use the @ref clAmdBlasStpsv function. */ /** * @brief solving triangular packed matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ __inline clAmdBlasStatus clAmdBlasDtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular packed matrix problems with float complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. 
* @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ __inline clAmdBlasStatus clAmdBlasCtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular packed matrix problems with double complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ __inline clAmdBlasStatus clAmdBlasZtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SPMV SPMV - Symmetric packed matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric packed-matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the matrix sizes or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPMV */ __inline clAmdBlasStatus clAmdBlasSspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSspmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sspmv.c * This is an example of how to use the @ref clAmdBlasSspmv function. */ /** * @brief Matrix-vector product with a symmetric packed-matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSspmv() function otherwise.
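 *
 * Illustrative usage sketch (not part of the original header), assuming an
 * existing command queue \c queue and cl_mem buffers \c bufAP (N*(N+1)/2
 * packed double elements), \c bufX and \c bufY (N doubles each); the names
 * are placeholders for this sketch only:
 *
 * @code
 * size_t N = 200;
 * cl_double alpha = 1.0, beta = 0.0;
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Y <- alpha * A * X + beta * Y, with the symmetric matrix A supplied as
 * // its upper triangle in packed column-major form.
 * status = clAmdBlasDspmv(clblasColumnMajor, clblasUpper, N, alpha,
 *                         bufAP, 0,         // packed A, no offset
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         beta,
 *                         bufY, 0, 1,       // Y: no offset, unit increment
 *                         1, &queue, 0, NULL, &event);
 * @endcode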
* * @ingroup SPMV */ __inline clAmdBlasStatus clAmdBlasDspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDspmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HPMV HPMV - Hermitian packed matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a packed hermitian matrix and float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing packed matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the matrix sizes or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
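 *
 * Illustrative usage sketch (not part of the original header), assuming an
 * existing command queue \c queue and cl_mem buffers \c bufAP (N*(N+1)/2
 * packed float-complex elements), \c bufX and \c bufY (N float-complex
 * elements each); the names are placeholders for this sketch only:
 *
 * @code
 * size_t N = 64;
 * cl_float2 alpha = {{1.0f, 0.0f}};         // alpha = 1 + 0i
 * cl_float2 beta  = {{0.0f, 0.0f}};         // beta = 0, so Y is overwritten
 * cl_event event = NULL;
 * clAmdBlasStatus status;
 *
 * // Y <- alpha * A * X + beta * Y, with the hermitian matrix A supplied as
 * // its upper triangle in packed column-major form.
 * status = clAmdBlasChpmv(clblasColumnMajor, clblasUpper, N, alpha,
 *                         bufAP, 0,         // packed A, no offset
 *                         bufX, 0, 1,       // X: no offset, unit increment
 *                         beta,
 *                         bufY, 0, 1,       // Y: no offset, unit increment
 *                         1, &queue, 0, NULL, &event);
 * @endcode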
* * @ingroup HPMV */ __inline clAmdBlasStatus clAmdBlasChpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChpmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_chpmv.c * This is an example of how to use the @ref clAmdBlasChpmv function. */ /** * @brief Matrix-vector product with a packed hermitian matrix and double-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing packed matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChpmv() function otherwise. * * @ingroup HPMV */ __inline clAmdBlasStatus clAmdBlasZhpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhpmv( order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SPR SPR - Symmetric packed matrix rank 1 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 1 update operations on packed matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 1 operation with a general triangular packed-matrix and * float elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha X X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. 
* @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero * - \b clAmdBlasInvalidMemObject if either \b AP, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPR */ __inline clAmdBlasStatus clAmdBlasSspr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasSspr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sspr.c * Example of how to use the @ref clAmdBlasSspr function. */ /** * @brief Symmetric rank 1 operation with a general triangular packed-matrix and * double elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha X X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSspr() function otherwise. 
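 *
 * @par Example
 * Illustrative sketch only, not taken from the library sources: \b ctx and
 * \b queue are placeholder names for a valid OpenCL context and command queue,
 * clAmdBlasSetup() is assumed to have been called, and error checking is
 * omitted.
 * @code
 * const size_t N = 3;
 * cl_double x[3]  = {1.0, 2.0, 3.0};       // vector X
 * cl_double ap[6] = {0.0};                 // packed N*(N+1)/2 elements of A
 * cl_int err;
 * cl_mem bufX  = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(x), x, &err);
 * cl_mem bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(ap), ap, &err);
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasDspr(clAmdBlasColumnMajor, clAmdBlasLower,
 *                                        N, 1.0, bufX, 0, 1, bufAP, 0,
 *                                        1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, sizeof(ap), ap, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(bufAP); clReleaseMemObject(bufX);
 * @endcode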
* * @ingroup SPR */ __inline clAmdBlasStatus clAmdBlasDspr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasDspr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HPR HPR - Hermitian packed matrix rank 1 update * * The Level 2 Basic Linear Algebra Subprogram functions that perform * hermitian rank 1 operations on packed matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief hermitian rank 1 operation with a general triangular packed-matrix and * float-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero * - \b clAmdBlasInvalidMemObject if either \b AP, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HPR */ __inline clAmdBlasStatus clAmdBlasChpr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasChpr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_chpr.c * Example of how to use the @ref clAmdBlasChpr function. */ /** * @brief hermitian rank 1 operation with a general triangular packed-matrix and * double-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. 
* @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChpr() function otherwise. * * @ingroup HPR */ __inline clAmdBlasStatus clAmdBlasZhpr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasZhpr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SPR2 SPR2 - Symmetric packed matrix rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 2 update operations on packed matrices * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 2 operation with a general triangular packed-matrix and * float elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero * - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPR2 */ __inline clAmdBlasStatus clAmdBlasSspr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasSspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sspr2.c * Example of how to use the @ref clAmdBlasSspr2 function. */ /** * @brief Symmetric rank 2 operation with a general triangular packed-matrix and * double elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSspr2() function otherwise. 
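 *
 * @par Example
 * A minimal call sketch (not from the library sources), assuming a context
 * \b ctx, a command queue \b queue and a prior clAmdBlasSetup() call; error
 * checking is omitted.
 * @code
 * const size_t N = 3;
 * cl_double x[3]  = {1.0, 2.0, 3.0};
 * cl_double y[3]  = {4.0, 5.0, 6.0};
 * cl_double ap[6] = {0.0};                 // packed N*(N+1)/2 elements of A
 * cl_int err;
 * cl_mem bufX  = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(x), x, &err);
 * cl_mem bufY  = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(y), y, &err);
 * cl_mem bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(ap), ap, &err);
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasDspr2(clAmdBlasColumnMajor, clAmdBlasUpper,
 *                                         N, 1.0, bufX, 0, 1, bufY, 0, 1,
 *                                         bufAP, 0,
 *                                         1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, sizeof(ap), ap, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(bufAP); clReleaseMemObject(bufY); clReleaseMemObject(bufX);
 * @endcode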
* * @ingroup SPR2 */ __inline clAmdBlasStatus clAmdBlasDspr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasDspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HPR2 HPR2 - Hermitian packed matrix rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * Hermitian rank 2 update operations on packed matrices. * @ingroup BLAS2 */ /*@{*/ /** * @brief Hermitian rank 2 operation with a general triangular packed-matrix and * float-complex elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \conjg( \alpha ) Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero; * - \b clAmdBlasInvalidMemObject if either \b AP, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable.
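 *
 * @par Example
 * A sketch of a typical call, not taken from the library sources; \b ctx and
 * \b queue are placeholders for a caller-created context and command queue,
 * clAmdBlasSetup() is assumed to have been called, and error checks are
 * omitted.
 * @code
 * const size_t N = 3;
 * cl_float x[2 * 3], y[2 * 3];             // N float-complex elements each
 * cl_float ap[2 * 6];                      // packed N*(N+1)/2 complex elements
 * // ... fill x, y and ap with application data ...
 * cl_float2 alpha = {{1.0f, 0.0f}};
 * cl_int err;
 * cl_mem bufX  = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(x), x, &err);
 * cl_mem bufY  = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(y), y, &err);
 * cl_mem bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                               sizeof(ap), ap, &err);
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasChpr2(clAmdBlasColumnMajor, clAmdBlasUpper,
 *                                         N, alpha, bufX, 0, 1, bufY, 0, 1,
 *                                         bufAP, 0,
 *                                         1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, sizeof(ap), ap, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(bufAP); clReleaseMemObject(bufY); clReleaseMemObject(bufX);
 * @endcode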
* * @ingroup HPR2 */ __inline clAmdBlasStatus clAmdBlasChpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasChpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Hermitian rank 2 operation with a general triangular packed-matrix and * double-complex elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \conjg( \alpha ) Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChpr2() function otherwise. * * @ingroup HPR2 */ __inline clAmdBlasStatus clAmdBlasZhpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { return clblasZhpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_zhpr2.c * Example of how to use the @ref clAmdBlasZhpr2 function. */ /*@}*/ /** * @defgroup GBMV GBMV - General banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a general rectangular banded matrix and * float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A.
* @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b M or \b N is zero, or * - KL is greater than \b M - 1, or * - KU is greater than \b N - 1, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix size or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GBMV */ __inline clAmdBlasStatus clAmdBlasSgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sgbmv.c * Example of how to use the @ref clAmdBlasSgbmv function. */ /** * @brief Matrix-vector product with a general rectangular banded matrix and * double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. 
* @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSgbmv() function otherwise. * * @ingroup GBMV */ __inline clAmdBlasStatus clAmdBlasDgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular banded matrix and * float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. 
* @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasSgbmv() function. * * @ingroup GBMV */ __inline clAmdBlasStatus clAmdBlasCgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a general rectangular banded matrix and * double-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDgbmv() function. * * @ingroup GBMV */ __inline clAmdBlasStatus clAmdBlasZgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TBMV TBMV - Triangular banded matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a triangular banded matrix and * float elements. 
* * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TBMV */ __inline clAmdBlasStatus clAmdBlasStbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_stbmv.c * Example of how to use the @ref clAmdBlasStbmv function. */ /** * @brief Matrix-vector product with a triangular banded matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. 
* @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStbmv() function otherwise. * * @ingroup TBMV */ __inline clAmdBlasStatus clAmdBlasDtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a triangular banded matrix and * float-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasStbmv() function. 
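 *
 * @par Example
 * Illustrative sketch, not part of the library sources: \b ctx and \b queue
 * are placeholder names for a caller-created context and command queue,
 * clAmdBlasSetup() is assumed to have been called, and error handling is
 * omitted. The banded matrix is stored in the usual BLAS band layout with
 * \b lda = \b K + 1.
 * @code
 * const size_t N = 4, K = 1, lda = K + 1;  // one off-diagonal band
 * cl_float a[2 * 2 * 4];                   // lda * N float-complex elements
 * cl_float x[2 * 4];                       // N float-complex elements
 * // ... fill a (band storage) and x with application data ...
 * cl_int err;
 * cl_mem bufA    = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                                 sizeof(a), a, &err);
 * cl_mem bufX    = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                 sizeof(x), x, &err);
 * cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(x), NULL,
 *                                 &err); // >= (1 + (N-1)*abs(incx)) elements
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasCtbmv(clAmdBlasColumnMajor, clAmdBlasUpper,
 *                                         clAmdBlasNoTrans, clAmdBlasNonUnit,
 *                                         N, K, bufA, 0, lda, bufX, 0, 1, scratch,
 *                                         1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, sizeof(x), x, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(scratch); clReleaseMemObject(bufX); clReleaseMemObject(bufA);
 * @endcode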
* * @ingroup TBMV */ __inline clAmdBlasStatus clAmdBlasCtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-vector product with a triangular banded matrix and * double-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDtbmv() function. * * @ingroup TBMV */ __inline clAmdBlasStatus clAmdBlasZtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SBMV SBMV - Symmetric banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric banded matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. 
* @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SBMV */ __inline clAmdBlasStatus clAmdBlasSsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssbmv.c * This is an example of how to use the @ref clAmdBlasSsbmv function. */ /** * @brief Matrix-vector product with a symmetric banded matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. 
* Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsbmv() function otherwise. * * @ingroup SBMV */ __inline clAmdBlasStatus clAmdBlasDsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HBMV HBMV - Hermitian banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a hermitian banded matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HBMV */ __inline clAmdBlasStatus clAmdBlasChbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_chbmv.c * This is an example of how to use the @ref clAmdBlasChbmv function. */ /** * @brief Matrix-vector product with a hermitian banded matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChbmv() function otherwise. 
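 *
 * @par Example
 * A minimal call sketch (not from the library sources), assuming a
 * double-precision capable device, a caller-created context \b ctx and queue
 * \b queue, a prior clAmdBlasSetup() call, and no error checking.
 * @code
 * const size_t N = 4, K = 1, lda = K + 1;
 * cl_double a[2 * 2 * 4];                  // lda * N double-complex elements, band storage
 * cl_double x[2 * 4], y[2 * 4];            // N double-complex elements each
 * // ... fill a, x and y with application data ...
 * cl_double2 alpha = {{1.0, 0.0}}, beta = {{0.0, 0.0}};
 * cl_int err;
 * cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                              sizeof(a), a, &err);
 * cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                              sizeof(x), x, &err);
 * cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                              sizeof(y), y, &err);
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasZhbmv(clAmdBlasColumnMajor, clAmdBlasUpper,
 *                                         N, K, alpha, bufA, 0, lda,
 *                                         bufX, 0, 1, beta, bufY, 0, 1,
 *                                         1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, sizeof(y), y, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA);
 * @endcode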
* * @ingroup HBMV */ __inline clAmdBlasStatus clAmdBlasZhbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TBSV TBSV - Solving triangular banded matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular banded matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clAmdBlasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
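 *
 * @par Example
 * Illustrative sketch only (not from the library sources); \b ctx and \b queue
 * are placeholders for a caller-created context and command queue,
 * clAmdBlasSetup() is assumed to have been called, and error checking is
 * omitted. On success, \b X is overwritten with the solution of the banded
 * triangular system.
 * @code
 * const size_t N = 4, K = 1, lda = K + 1;
 * cl_float a[2 * 4];                       // lda * N elements, band storage
 * cl_float x[4];                           // right-hand side, overwritten with the solution
 * // ... fill a and x with application data ...
 * cl_int err;
 * cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY  | CL_MEM_COPY_HOST_PTR,
 *                              sizeof(a), a, &err);
 * cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                              sizeof(x), x, &err);
 * cl_event done;
 * clAmdBlasStatus status = clAmdBlasStbsv(clAmdBlasColumnMajor, clAmdBlasUpper,
 *                                         clAmdBlasNoTrans, clAmdBlasNonUnit,
 *                                         N, K, bufA, 0, lda, bufX, 0, 1,
 *                                         1, &queue, 0, NULL, &done);
 * if (status == clAmdBlasSuccess) {
 *     clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, sizeof(x), x, 1, &done, NULL);
 *     clReleaseEvent(done);
 * }
 * clReleaseMemObject(bufX); clReleaseMemObject(bufA);
 * @endcode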
* * @ingroup TBSV */ __inline clAmdBlasStatus clAmdBlasStbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_stbsv.c * This is an example of how to use the @ref clAmdBlasStbsv function. */ /** * @brief solving triangular banded matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStbsv() function otherwise. * * @ingroup TBSV */ __inline clAmdBlasStatus clAmdBlasDtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular banded matrix problems with float-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. 
It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasStbsv() function. * * @ingroup TBSV */ __inline clAmdBlasStatus clAmdBlasCtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief solving triangular banded matrix problems with double-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDtbsv() function. * * @ingroup TBSV */ __inline clAmdBlasStatus clAmdBlasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup BLAS3 BLAS-3 functions * * The Level 3 Basic Linear Algebra Subprograms are funcions that perform * matrix-matrix operations. */ /*@{*/ /*@}*/ /** * @defgroup GEMM GEMM - General matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of general rectangular matrices with float * elements. 
* * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b K when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b K * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasSgemmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M, \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. 
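 *
 * A minimal usage sketch (an editorial addition, not part of the original
 * documentation), assuming a valid context \c ctx, a command queue \c queue,
 * and a prior successful call to clAmdBlasSetup(). It computes
 * \f$ C \leftarrow \alpha A B + \beta C \f$ for row-major matrices:
 * @code
 * size_t M = 64, N = 64, K = 64;
 * size_t lda = K, ldb = N, ldc = N;          // row-major leading dimensions
 * cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
 *                              M * K * sizeof(cl_float), NULL, NULL);
 * cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
 *                              K * N * sizeof(cl_float), NULL, NULL);
 * cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                              M * N * sizeof(cl_float), NULL, NULL);
 * // ... upload A, B and (if beta != 0) C with clEnqueueWriteBuffer() ...
 * cl_event done;
 * clAmdBlasStatus st = clAmdBlasSgemm(clAmdBlasRowMajor, clAmdBlasNoTrans,
 *                                     clAmdBlasNoTrans, M, N, K, 1.0f,
 *                                     bufA, lda, bufB, ldb, 0.0f,
 *                                     bufC, ldc, 1, &queue, 0, NULL, &done);
 * if (st == clAmdBlasSuccess)
 *     clWaitForEvents(1, &done);             // read bufC back afterwards
 * @endcode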
* * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_float beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgemm( order, transA, transB, M, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sgemm.c * This is an example of how to use the @ref clAmdBlasSgemm function. */ /** * @brief Matrix-matrix product of general rectangular matrices with double * elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for use in new * applications. Use the superseding function clAmdBlasDgemmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSgemm() function otherwise. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasDgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_double beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgemm( order, transA, transB, M, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of general rectangular matrices with float * complex elements.
* * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCgemmEx() instead. * * @return The same result as the clAmdBlasSgemm() function. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasCgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, FloatComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgemm( order, transA, transB, M, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of general rectangular matrices with double * complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZgemmEx() instead. * * @return The same result as the clAmdBlasDgemm() function. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasZgemm( clAmdBlasOrder order, clAmdBlasTranspose transA, clAmdBlasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, DoubleComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgemm( order, transA, transB, M, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of general rectangular matrices with float * elements. Extended version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b K when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when the * parameter is set to \b clAmdBlasColumnMajor. * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b K * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
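 *
 * Offset handling (an editorial note, not part of the original documentation):
 * the offsets are counted in matrix elements, so for a row-major matrix the
 * element at row \c i and column \c j of a larger buffer can be addressed as
 * in the sketch below, which multiplies two 32 x 32 sub-blocks of existing
 * 64 x 64 buffers \c bufA, \c bufB and \c bufC using an existing queue
 * \c queue:
 * @code
 * // Hypothetical helper: element offset of (i, j) in a row-major matrix
 * // whose leading dimension is ld (editorial example, not a library API).
 * static size_t rowMajorOffset(size_t i, size_t j, size_t ld)
 * {
 *     return i * ld + j;
 * }
 *
 * // Multiply the 32x32 blocks starting at (16, 16) of A, B and C,
 * // each buffer holding a 64x64 row-major matrix (lda = ldb = ldc = 64).
 * clAmdBlasStatus st = clAmdBlasSgemmEx(clAmdBlasRowMajor, clAmdBlasNoTrans,
 *                                       clAmdBlasNoTrans, 32, 32, 32, 1.0f,
 *                                       bufA, rowMajorOffset(16, 16, 64), 64,
 *                                       bufB, rowMajorOffset(16, 16, 64), 64,
 *                                       0.0f,
 *                                       bufC, rowMajorOffset(16, 16, 64), 64,
 *                                       1, &queue, 0, NULL, NULL);
 * @endcode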
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as clAmdBlasSgemm() otherwise. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasSgemmEx( clAmdBlasOrder order, clAmdBlasTranspose transA, clAmdBlasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_sgemm.c * This is an example of how to use the @ref clAmdBlasSgemmEx function. */ /** * @brief Matrix-matrix product of general rectangular matrices with double * elements. Extended version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clAmdBlasSgemm() function otherwise. 
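 *
 * A short editorial sketch (not part of the original documentation) of the
 * error handling that is specific to the double-precision entry points,
 * assuming \c queue and the column-major buffers \c bufA, \c bufB and
 * \c bufC already exist:
 * @code
 * size_t M = 64, N = 64, K = 64;
 * cl_event done;
 * clAmdBlasStatus st = clAmdBlasDgemmEx(clAmdBlasColumnMajor, clAmdBlasNoTrans,
 *                                       clAmdBlasNoTrans, M, N, K, 1.0,
 *                                       bufA, 0, M, bufB, 0, K, 0.0,
 *                                       bufC, 0, M, 1, &queue, 0, NULL, &done);
 * if (st == clAmdBlasInvalidDevice) {
 *     // The device has no double-precision support; fall back to
 *     // clAmdBlasSgemmEx() or report the limitation to the caller.
 * } else if (st == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &done);
 * }
 * @endcode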
* * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasDgemmEx( clAmdBlasOrder order, clAmdBlasTranspose transA, clAmdBlasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of general rectangular matrices with float * complex elements. Extended version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clAmdBlasSgemm() function otherwise. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasCgemmEx( clAmdBlasOrder order, clAmdBlasTranspose transA, clAmdBlasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of general rectangular matrices with double * complex elements. 
Exteneded version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clAmdBlasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clAmdBlasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clAmdBlasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clAmdBlasSgemm() function otherwise. * * @ingroup GEMM */ __inline clAmdBlasStatus clAmdBlasZgemmEx( clAmdBlasOrder order, clAmdBlasTranspose transA, clAmdBlasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TRMM TRMM - Triangular matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Multiplying a matrix by a triangular matrix with float elements. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. 
* @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when it is set * to \b clAmdBlasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or not less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasStrmmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M, \b N, or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasStrmm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrmm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strmm.c * This is an example of how to use the @ref clAmdBlasStrmm function. */ /** * @brief Multiplying a matrix by a triangular matrix with double elements. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. 
* @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDtrmmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStrmm() function otherwise. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasDtrmm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtrmm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Multiplying a matrix by a triangular matrix with float complex * elements. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCtrmmEx() instead. * * @return The same result as the clAmdBlasStrmm() function. 
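 *
 * A minimal usage sketch (an editorial addition, not part of the original
 * documentation). \b FloatComplex is the OpenCL \c cl_float2 type, so a
 * complex scalar can be filled through its \c s[] members; the call below
 * assumes existing buffers \c bufA (M x M triangular) and \c bufB (M x N),
 * a command queue \c queue, and a prior clAmdBlasSetup():
 * @code
 * size_t M = 64, N = 32;
 * FloatComplex alpha;
 * alpha.s[0] = 1.0f;                         // real part
 * alpha.s[1] = 0.0f;                         // imaginary part
 * cl_event done;
 * clAmdBlasStatus st = clAmdBlasCtrmm(clAmdBlasRowMajor, clAmdBlasLeft,
 *                                     clAmdBlasUpper, clAmdBlasNoTrans,
 *                                     clAmdBlasNonUnit, M, N, alpha,
 *                                     bufA, M, bufB, N,
 *                                     1, &queue, 0, NULL, &done);
 * if (st == clAmdBlasSuccess)
 *     clWaitForEvents(1, &done);             // bufB is overwritten in place
 * @endcode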
* * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasCtrmm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrmm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Multiplying a matrix by a triangular matrix with double complex * elements. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZtrmmEx() instead. * * @return The same result as the clAmdBlasDtrmm() function. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasZtrmm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrmm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Multiplying a matrix by a triangular matrix with float elements. * Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. 
* @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when it is set * to \b clAmdBlasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or not less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clAmdBlasStrmm() otherwise. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasStrmmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrmm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strmm.c * This is an example of how to use the @ref clAmdBlasStrmmEx function. */ /** * @brief Multiplying a matrix by a triangular matrix with double elements. * Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. 
* @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clAmdBlasStrmm() function otherwise. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasDtrmmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtrmm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Multiplying a matrix by a triangular matrix with float complex * elements. Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clAmdBlasStrmm() otherwise. 
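 *
 * An editorial sketch (not part of the original documentation) of how the
 * offsets of the extended version select a sub-problem inside larger buffers;
 * \c bufA and \c bufB are assumed to hold 128 x 128 row-major FloatComplex
 * matrices, \c queue is an existing command queue, and only the leading
 * 64 x 64 blocks are multiplied:
 * @code
 * FloatComplex alpha;
 * alpha.s[0] = 2.0f;
 * alpha.s[1] = 0.0f;
 * clAmdBlasStatus st = clAmdBlasCtrmmEx(clAmdBlasRowMajor, clAmdBlasLeft,
 *                                       clAmdBlasLower, clAmdBlasNoTrans,
 *                                       clAmdBlasUnit, 64, 64, alpha,
 *                                       bufA, 0, 128, bufB, 0, 128,
 *                                       1, &queue, 0, NULL, NULL);
 * @endcode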
* * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasCtrmmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrmm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Multiplying a matrix by a triangular matrix with double complex * elements. Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clAmdBlasStrmm() function otherwise. * * @ingroup TRMM */ __inline clAmdBlasStatus clAmdBlasZtrmmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrmm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup TRSM TRSM - Solving triangular systems of equations * @ingroup BLAS3 */ /*@{*/ /** * @brief Solving triangular systems of equations with multiple right-hand * sides and float elements. 
* * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N * when it is set to \b clAmdBlasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasStrsmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M, \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs * to was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasStrsm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrsm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strsm.c * This is an example of how to use the @ref clAmdBlasStrsm function. 
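 * A condensed inline sketch of the same kind of call (an editorial addition,
 * not taken from example_strsm.c), assuming a context \c ctx, a queue
 * \c queue and a prior clAmdBlasSetup(); it overwrites \b B with the solution
 * of \f$ A X = \alpha B \f$ for a lower-triangular \b A:
 * @code
 * size_t M = 64, N = 16;
 * cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,
 *                              M * M * sizeof(cl_float), NULL, NULL);
 * cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                              M * N * sizeof(cl_float), NULL, NULL);
 * // ... upload A and the right-hand sides B with clEnqueueWriteBuffer() ...
 * cl_event done;
 * clAmdBlasStatus st = clAmdBlasStrsm(clAmdBlasRowMajor, clAmdBlasLeft,
 *                                     clAmdBlasLower, clAmdBlasNoTrans,
 *                                     clAmdBlasNonUnit, M, N, 1.0f,
 *                                     bufA, M, bufB, N,
 *                                     1, &queue, 0, NULL, &done);
 * if (st == clAmdBlasSuccess)
 *     clWaitForEvents(1, &done);             // bufB now holds the solution X
 * @endcode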
*/ /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double elements. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDtrsmEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasStrsm() function otherwise. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasDtrsm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDtrsm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Solving triangular systems of equations with multiple right-hand * sides and float complex elements. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrsm(). 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCtrsmEx() instead. * * @return The same result as the clAmdBlasStrsm() function. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasCtrsm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrsm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double complex elements. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZtrsmEx() instead. * * @return The same result as the clAmdBlasDtrsm() function. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasZtrsm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t lda, cl_mem B, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrsm( order, side, uplo, transA, diag, M, N, alpha, A, 0, lda, B, 0, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Solving triangular systems of equations with multiple right-hand * sides and float elements. Extended version. 
* * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N * when it is set to \b clAmdBlasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clAmdBlasStrsm() otherwise. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasStrsmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasStrsm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_strsm.c * This is an example of how to use the @ref clAmdBlasStrsmEx function. */ /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double elements. Extended version. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. 
 * @param[in] offA      Offset of the first element of the matrix \b A in the
 *                      buffer object. Counted in elements.
 * @param[in] lda       Leading dimension of matrix \b A. For detailed
 *                      description, see clAmdBlasStrsm().
 * @param[out] B        Buffer object storing matrix \b B.
 * @param[in] offB      Offset of the first element of the matrix \b B in the
 *                      buffer object. Counted in elements.
 * @param[in] ldb       Leading dimension of matrix \b B. For detailed
 *                      description, see clAmdBlasStrsm().
 * @param[in] numCommandQueues    Number of OpenCL command queues in which the
 *                                task is to be performed.
 * @param[in] commandQueues       OpenCL command queues.
 * @param[in] numEventsInWaitList Number of events in the event wait list.
 * @param[in] eventWaitList       Event wait list.
 * @param[in] events    Event objects per each command queue that identify
 *                      a particular kernel execution instance.
 *
 * @return
 *   - \b clAmdBlasSuccess on success;
 *   - \b clAmdBlasInvalidDevice if a target device does not support floating
 *     point arithmetic with double precision;
 *   - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size
 *     of the respective buffer object;
 *   - the same error codes as the clAmdBlasStrsm() function otherwise.
 *
 * @ingroup TRSM
 */
__inline clAmdBlasStatus
clAmdBlasDtrsmEx(
    clAmdBlasOrder order,
    clAmdBlasSide side,
    clAmdBlasUplo uplo,
    clAmdBlasTranspose transA,
    clAmdBlasDiag diag,
    size_t M,
    size_t N,
    cl_double alpha,
    const cl_mem A,
    size_t offA,
    size_t lda,
    cl_mem B,
    size_t offB,
    size_t ldb,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    return clblasDtrsm(order, side, uplo, transA, diag, M, N, alpha, A, offA,
                       lda, B, offB, ldb, numCommandQueues, commandQueues,
                       numEventsInWaitList, eventWaitList, events);
}

/**
 * @brief Solving triangular systems of equations with multiple right-hand
 *        sides and float complex elements. Extended version.
 *
 * Solving triangular systems of equations:
 *   - \f$ B \leftarrow \alpha A^{-1} B \f$
 *   - \f$ B \leftarrow \alpha A^{-T} B \f$
 *   - \f$ B \leftarrow \alpha B A^{-1} \f$
 *   - \f$ B \leftarrow \alpha B A^{-T} \f$
 *
 * where \b A is an upper or lower triangular matrix.
 *
 * @param[in] order     Row/column order.
 * @param[in] side      The side of triangular matrix.
 * @param[in] uplo      The triangle in matrix being referenced.
 * @param[in] transA    How matrix \b A is to be transposed.
 * @param[in] diag      Specify whether matrix is unit triangular.
 * @param[in] M         Number of rows in matrix \b B.
 * @param[in] N         Number of columns in matrix \b B.
 * @param[in] alpha     The factor of matrix \b A.
 * @param[in] A         Buffer object storing matrix \b A.
 * @param[in] offA      Offset of the first element of the matrix \b A in the
 *                      buffer object. Counted in elements.
 * @param[in] lda       Leading dimension of matrix \b A. For detailed
 *                      description, see clAmdBlasStrsm().
 * @param[out] B        Buffer object storing matrix \b B.
 * @param[in] offB      Offset of the first element of the matrix \b B in the
 *                      buffer object. Counted in elements.
 * @param[in] ldb       Leading dimension of matrix \b B. For detailed
 *                      description, see clAmdBlasStrsm().
 * @param[in] numCommandQueues    Number of OpenCL command queues in which the
 *                                task is to be performed.
 * @param[in] commandQueues       OpenCL command queues.
 * @param[in] numEventsInWaitList Number of events in the event wait list.
 * @param[in] eventWaitList       Event wait list.
 * @param[in] events    Event objects per each command queue that identify
 *                      a particular kernel execution instance.
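 *
 * A minimal invocation sketch; \b queue, \b bufA and \b bufB are illustrative
 * names for objects assumed to have been created elsewhere, and
 * clAmdBlasSetup() is assumed to have succeeded:
 * @code
 * size_t M = 64, N = 32;
 * FloatComplex alpha;
 * alpha.s[0] = 1.0f;    // real part
 * alpha.s[1] = 0.0f;    // imaginary part
 * cl_event event = NULL;
 * clAmdBlasStatus status = clAmdBlasCtrsmEx(
 *     clAmdBlasColumnMajor, clAmdBlasLeft, clAmdBlasUpper,
 *     clAmdBlasNoTrans, clAmdBlasNonUnit,
 *     M, N, alpha,
 *     bufA, 0, M,        // offA = 0, lda = M (side = left)
 *     bufB, 0, M,        // offB = 0, ldb = M (column-major)
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 * }
 * @endcode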
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clAmdBlasStrsm() otherwise. * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasCtrsmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCtrsm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double complex elements. Extended version. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clAmdBlasStrsm() function otherwise * * @ingroup TRSM */ __inline clAmdBlasStatus clAmdBlasZtrsmEx( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, clAmdBlasTranspose transA, clAmdBlasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZtrsm( order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SYRK SYRK - Symmetric rank-k update of a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-k update of a symmetric matrix with float elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasSsyrkEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A or \b C object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released. 
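 *
 * This wrapper forwards directly to the clblas API with zero buffer offsets,
 * so the call in the sketch below is equivalent to calling clblasSsyrk()
 * with \b offA = 0 and \b offC = 0. The names \b queue, \b bufA and \b bufC
 * are illustrative placeholders for objects created elsewhere:
 * @code
 * size_t N = 128, K = 64;
 * cl_event event = NULL;
 * clAmdBlasStatus status = clAmdBlasSsyrk(
 *     clAmdBlasColumnMajor, clAmdBlasUpper, clAmdBlasNoTrans,
 *     N, K, 1.0f,
 *     bufA, N,           // lda >= N for a column-major, non-transposed A
 *     0.0f,
 *     bufC, N,           // ldc >= N
 *     1, &queue, 0, NULL, &event);
 * @endcode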
* * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasSsyrk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t lda, cl_float beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyrk( order, uplo, transA, N, K, alpha, A, 0, lda, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssyrk.c * This is an example of how to use the @ref clAmdBlasSsyrk function. */ /** * @brief Rank-k update of a symmetric matrix with double elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDsyrkEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsyrk() function otherwise. * * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasDsyrk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t lda, cl_double beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyrk( order, uplo, transA, N, K, alpha, A, 0, lda, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-k update of a symmetric matrix with complex float elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. 
* @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCsyrkEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyrk() function otherwise. * * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasCsyrk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t lda, FloatComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsyrk( order, uplo, transA, N, K, alpha, A, 0, lda, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-k update of a symmetric matrix with complex double elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZsyrkEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyrk() function otherwise. 
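 *
 * \b DoubleComplex is the OpenCL \b cl_double2 vector type, so the complex
 * factors can be filled in through its \b .s array, as in the sketch below
 * (an illustration, not a required idiom):
 * @code
 * DoubleComplex alpha, beta;
 * alpha.s[0] = 1.0;  alpha.s[1] = 0.0;   // alpha = 1 + 0i
 * beta.s[0]  = 0.0;  beta.s[1]  = 0.0;   // beta  = 0 + 0i
 * @endcode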
* * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasZsyrk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t lda, DoubleComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZsyrk( order, uplo, transA, N, K, alpha, A, 0, lda, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-k update of a symmetric matrix with float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - the same error codes as the clAmdBlasSsyrk() function otherwise. * * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasSsyrkEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyrk( order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssyrk.c * This is an example of how to use the @ref clAmdBlasSsyrkEx function. */ /** * @brief Rank-k update of a symmetric matrix with double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. 
* @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - the same error codes as the clAmdBlasSsyrk() function otherwise. * * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasDsyrkEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyrk( order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-k update of a symmetric matrix with complex float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyrk() function otherwise. * * @ingroup SYRK */ __inline clAmdBlasStatus clAmdBlasCsyrkEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsyrk( order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-k update of a symmetric matrix with complex double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - \b clAmdBlasInvalidValue if \b transA is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyrk() function otherwise. 
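 *
 * The offsets are counted in elements, not bytes. A small illustration
 * (the names \b i0, \b j0 and \b lda are placeholders) of addressing the
 * submatrix that starts at row \b i0 and column \b j0 of a larger matrix
 * kept in the same buffer:
 * @code
 * // Row-major storage: element (i, j) lives at index i * lda + j.
 * size_t offA_rowmajor = i0 * lda + j0;
 * // Column-major storage: element (i, j) lives at index i + j * lda.
 * size_t offA_colmajor = i0 + j0 * lda;
 * @endcode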
 *
 * @ingroup SYRK
 */
__inline clAmdBlasStatus
clAmdBlasZsyrkEx(
    clAmdBlasOrder order,
    clAmdBlasUplo uplo,
    clAmdBlasTranspose transA,
    size_t N,
    size_t K,
    DoubleComplex alpha,
    const cl_mem A,
    size_t offA,
    size_t lda,
    DoubleComplex beta,
    cl_mem C,
    size_t offC,
    size_t ldc,
    cl_uint numCommandQueues,
    cl_command_queue *commandQueues,
    cl_uint numEventsInWaitList,
    const cl_event *eventWaitList,
    cl_event *events)
{
    return clblasZsyrk(order, uplo, transA, N, K, alpha, A, offA, lda, beta,
                       C, offC, ldc, numCommandQueues, commandQueues,
                       numEventsInWaitList, eventWaitList, events);
}

/*@}*/

/**
 * @defgroup SYR2K SYR2K - Symmetric rank-2k update to a matrix
 * @ingroup BLAS3
 */
/*@{*/

/**
 * @brief Rank-2k update of a symmetric matrix with float elements.
 *
 * Rank-k updates:
 *   - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$
 *   - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$
 *
 * where \b C is a symmetric matrix.
 *
 * @param[in] order     Row/column order.
 * @param[in] uplo      The triangle in matrix \b C being referenced.
 * @param[in] transAB   How matrices \b A and \b B are to be transposed.
 * @param[in] N         Number of rows and columns in matrix \b C.
 * @param[in] K         Number of columns of the matrices \b A and \b B if they
 *                      are not transposed, and number of rows otherwise.
 * @param[in] alpha     The factor of matrices \b A and \b B.
 * @param[in] A         Buffer object storing matrix \b A.
 * @param[in] lda       Leading dimension of matrix \b A. It cannot be less
 *                      than \b K if \b A is in the row-major format, and less
 *                      than \b N otherwise.
 * @param[in] B         Buffer object storing matrix \b B.
 * @param[in] ldb       Leading dimension of matrix \b B. It cannot be less
 *                      than \b K if \b B matches to the op(\b B) matrix in the
 *                      row-major format, and less than \b N otherwise.
 * @param[in] beta      The factor of matrix \b C.
 * @param[out] C        Buffer object storing matrix \b C.
 * @param[in] ldc       Leading dimension of matrix \b C. It cannot be less
 *                      than \b N.
 * @param[in] numCommandQueues    Number of OpenCL command queues in which the
 *                                task is to be performed.
 * @param[in] commandQueues       OpenCL command queues.
 * @param[in] numEventsInWaitList Number of events in the event wait list.
 * @param[in] eventWaitList       Event wait list.
 * @param[in] events    Event objects per each command queue that identify
 *                      a particular kernel execution instance.
 *
 * The function is obsolete and is not recommended for using in new
 * applications. Use the superseding function clAmdBlasSsyr2kEx() instead.
 *
 * @return
 *   - \b clAmdBlasSuccess on success;
 *   - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called;
 *   - \b clAmdBlasInvalidValue if invalid parameters are passed:
 *     - either \b N or \b K is zero, or
 *     - any of the leading dimensions is invalid;
 *     - the matrix sizes lead to accessing outside of any of the buffers;
 *   - \b clAmdBlasInvalidMemObject if either \b A, \b B or \b C object is
 *     invalid, or an image object rather than the buffer one;
 *   - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for
 *     internal structures;
 *   - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid;
 *   - \b clAmdBlasInvalidContext if a context a passed command queue belongs to
 *     was released;
 *   - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous
 *     call has not completed for any of the target devices;
 *   - \b clAmdBlasCompilerNotAvailable if a compiler is not available;
 *   - \b clAmdBlasBuildProgramFailure if there is a failure to build a program
 *     executable.
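 *
 * A condensed usage sketch; it assumes an existing context \b ctx, a command
 * queue \b queue, and host arrays \b a, \b b and \b c sized for column-major
 * \b N x \b K (\b a, \b b) and \b N x \b N (\b c) storage. The names are
 * illustrative only:
 * @code
 * cl_int err;
 * cl_mem bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  N * K * sizeof(cl_float), NULL, &err);
 * cl_mem bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  N * K * sizeof(cl_float), NULL, &err);
 * cl_mem bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(cl_float), NULL, &err);
 * clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(cl_float), a, 0, NULL, NULL);
 * clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, N * K * sizeof(cl_float), b, 0, NULL, NULL);
 * clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(cl_float), c, 0, NULL, NULL);
 *
 * cl_event event = NULL;
 * clAmdBlasStatus status = clAmdBlasSsyr2k(
 *     clAmdBlasColumnMajor, clAmdBlasUpper, clAmdBlasNoTrans,
 *     N, K, 1.0f, bufA, N, bufB, N, 1.0f, bufC, N,
 *     1, &queue, 0, NULL, &event);
 * if (status == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);
 *     clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(cl_float), c, 0, NULL, NULL);
 * }
 * @endcode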
* * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasSsyr2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_float beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyr2k( order, uplo, transAB, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssyr2k.c * This is an example of how to use the @ref clAmdBlasSsyr2k function. */ /** * @brief Rank-2k update of a symmetric matrix with double elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasDsyr2kEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsyr2k() function otherwise. * * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasDsyr2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_double beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyr2k( order, uplo, transAB, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-2k update of a symmetric matrix with complex float elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. 
* @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasCsyr2kEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyr2k() function otherwise. * * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasCsyr2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, FloatComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsyr2k( order, uplo, transAB, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-2k update of a symmetric matrix with complex double elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * The function is obsolete and is not recommended for using in new * applications. Use the superseding function clAmdBlasZsyr2kEx() instead. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyr2k() function otherwise. * * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasZsyr2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, DoubleComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZsyr2k( order, uplo, transAB, N, K, alpha, A, 0, lda, B, 0, ldb, beta, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-2k update of a symmetric matrix with float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * less than \b K if \b B matches to the op(\b B) matrix * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as the clAmdBlasSsyr2k() function otherwise. 
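 *
 * For illustration, with \b N = 1000, \b K = 64, row-major storage and
 * \b transAB set to \b clAmdBlasNoTrans, the leading-dimension constraints
 * above amount to:
 * @code
 * const size_t N = 1000, K = 64;
 * const size_t lda = K;     // A is N x K in row-major order, so lda >= K
 * const size_t ldb = K;     // B is N x K in row-major order, so ldb >= K
 * const size_t ldc = N;     // C is N x N, so ldc >= N
 * @endcode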
* * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasSsyr2kEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyr2k( order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssyr2k.c * This is an example of how to use the @ref clAmdBlasSsyr2kEx function. */ /** * @brief Rank-2k update of a symmetric matrix with double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as the clAmdBlasSsyr2k() function otherwise. 
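 *
 * Because the call fails with \b clAmdBlasInvalidDevice on devices without
 * double-precision support, it can be useful to query the device first.
 * A sketch assuming an existing \b cl_device_id named \b device:
 * @code
 * cl_device_fp_config fp64 = 0;
 * clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(fp64), &fp64, NULL);
 * if (fp64 == 0) {
 *     // No double support: fall back to clAmdBlasSsyr2kEx() or skip the call.
 * }
 * @endcode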
* * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasDsyr2kEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyr2k( order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-2k update of a symmetric matrix with complex float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyr2k() function otherwise. * * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasCsyr2kEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsyr2k( order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Rank-2k update of a symmetric matrix with complex double elements. 
* Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clAmdBlasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clAmdBlasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - \b clAmdBlasInvalidValue if \b transAB is set to \ref clAmdBlasConjTrans. * - the same error codes as the clAmdBlasSsyr2k() function otherwise. * * @ingroup SYR2K */ __inline clAmdBlasStatus clAmdBlasZsyr2kEx( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZsyr2k( order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup SYMM SYMM - Symmetric matrix-matrix multiply * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of symmetric rectangular matrices with float * elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. 
* @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M or \b N is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SYMM */ __inline clAmdBlasStatus clAmdBlasSsymm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_ssymm.c * This is an example of how to use the @ref clAmdBlasSsymm function. */ /** * @brief Matrix-matrix product of symmetric rectangular matrices with double * elements. 
* * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasSsymm() function otherwise. * * @ingroup SYMM */ __inline clAmdBlasStatus clAmdBlasDsymm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of symmetric rectangular matrices with * float-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. 
* @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasSsymm() function. * * @ingroup SYMM */ __inline clAmdBlasStatus clAmdBlasCsymm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @brief Matrix-matrix product of symmetric rectangular matrices with * double-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. 
* @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clAmdBlasDsymm() function. * * @ingroup SYMM */ __inline clAmdBlasStatus clAmdBlasZsymm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HEMM HEMM - Hermitian matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of hermitian rectangular matrices with * float-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
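 *
 * A minimal calling sketch for the queue and event arguments described above,
 * assuming hypothetical, already-created objects: a cl_command_queue \c queue
 * and cl_mem buffers \c bufA, \c bufB and \c bufC of sufficient size (the
 * floatComplex() helper builds a cl_float2 value):
 * @code
 * size_t M = 256, N = 128;
 * cl_float2 alpha = floatComplex(1.0f, 0.0f);
 * cl_float2 beta  = floatComplex(0.0f, 0.0f);
 * cl_event event  = NULL;
 * clAmdBlasStatus err = clAmdBlasChemm(
 *     clAmdBlasColumnMajor, clAmdBlasLeft, clAmdBlasUpper,
 *     M, N, alpha,
 *     bufA, 0, M,        // lda: not less than M for clAmdBlasLeft
 *     bufB, 0, M,        // ldb: not less than M in column-major order
 *     beta,
 *     bufC, 0, M,        // ldc: not less than M in column-major order
 *     1, &queue,         // a single command queue
 *     0, NULL,           // no events to wait for
 *     &event);
 * if (err == clAmdBlasSuccess) {
 *     clWaitForEvents(1, &event);   // block until the enqueued kernels finish
 *     clReleaseEvent(event);
 * }
 * @endcode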
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - \b M or \b N is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clAmdBlasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released; * - \b clAmdBlasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clAmdBlasCompilerNotAvailable if a compiler is not available; * - \b clAmdBlasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HEMM */ __inline clAmdBlasStatus clAmdBlasChemm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_chemm.cpp * This is an example of how to use the @ref clAmdBlasChemm function. */ /** * @brief Matrix-matrix product of hermitian rectangular matrices with * double-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clAmdBlasLeft,\n or less than \b N when the * parameter is set to \b clAmdBlasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M * when it is set to \b clAmdBlasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clAmdBlasRowMajor,\n or less than \b M when * it is set to \b clAmdBlasColumnMajorOrder. 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasChemm() function otherwise. * * @ingroup HEMM */ __inline clAmdBlasStatus clAmdBlasZhemm( clAmdBlasOrder order, clAmdBlasSide side, clAmdBlasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HERK HERK - Hermitian rank-k update to a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-k update of a hermitian matrix with float-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
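 *
 * A minimal sketch of the update \f$ C \leftarrow \alpha A A^H + \beta C \f$
 * in column-major order, assuming hypothetical pre-created buffers \c bufA
 * (an N-by-K complex-float matrix) and \c bufC (an N-by-N complex-float
 * matrix) and a valid \c queue; note that \b alpha and \b beta are real for
 * the Hermitian rank-k update:
 * @code
 * size_t N = 512, K = 64;
 * cl_event event = NULL;
 * clAmdBlasStatus err = clAmdBlasCherk(
 *     clAmdBlasColumnMajor, clAmdBlasUpper, clAmdBlasNoTrans,
 *     N, K,
 *     1.0f,              // alpha (real)
 *     bufA, 0, N,        // lda: not less than N in the non-transposed, column-major case
 *     0.0f,              // beta (real)
 *     bufC, 0, N,        // ldc: not less than N
 *     1, &queue, 0, NULL, &event);
 * @endcode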
* * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A or \b C object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released. * * @ingroup HERK */ __inline clAmdBlasStatus clAmdBlasCherk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offa, size_t lda, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCherk( order, uplo, transA, N, K, alpha, A, offa, lda, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_cherk.cpp * This is an example of how to use the @ref clAmdBlasCherk function. */ /** * @brief Rank-k update of a hermitian matrix with double-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCherk() function otherwise. 
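 *
 * Because the double-complex variant reports \b clAmdBlasInvalidDevice on
 * devices without double-precision support, the device can be probed first
 * with the standard OpenCL query. A minimal sketch, assuming a valid
 * cl_device_id \c device (hypothetical host code):
 * @code
 * cl_device_fp_config fp = 0;
 * clGetDeviceInfo(device, CL_DEVICE_DOUBLE_FP_CONFIG, sizeof(fp), &fp, NULL);
 * if (fp != 0) {
 *     // double precision is available; clAmdBlasZherk() may be called
 * }
 * @endcode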
* * @ingroup HERK */ __inline clAmdBlasStatus clAmdBlasZherk( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offa, size_t lda, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZherk( order, uplo, transA, N, K, alpha, A, offa, lda, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ /** * @defgroup HER2K HER2K - Hermitian rank-2k update to a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-2k update of a hermitian matrix with float-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] B Buffer object storing the matrix \b B. * @param[in] offb Offset in number of elements for the first element in matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be * less than \b K if \b B is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasNotInitialized if clAmdBlasSetup() was not called; * - \b clAmdBlasInvalidValue if invalid parameters are passed: * - either \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clAmdBlasInvalidMemObject if either \b A , \b B or \b C object is * invalid, or an image object rather than the buffer one; * - \b clAmdBlasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clAmdBlasInvalidCommandQueue if the passed command queue is invalid; * - \b clAmdBlasInvalidContext if a context a passed command queue belongs to * was released. 
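 *
 * A minimal sketch of the update
 * \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$ in
 * column-major order; note that \b alpha is complex while \b beta is real in
 * this interface. The buffers \c bufA, \c bufB, \c bufC, the \c queue and the
 * problem sizes are assumptions:
 * @code
 * size_t N = 256, K = 32;
 * FloatComplex alpha = floatComplex(1.0f, 0.0f);   // complex factor
 * cl_float beta = 1.0f;                            // real factor
 * cl_event event = NULL;
 * clAmdBlasStatus err = clAmdBlasCher2k(
 *     clAmdBlasColumnMajor, clAmdBlasLower, clAmdBlasNoTrans,
 *     N, K, alpha,
 *     bufA, 0, N,        // lda: not less than N in the non-transposed, column-major case
 *     bufB, 0, N,        // ldb: same rule as lda
 *     beta,
 *     bufC, 0, N,        // ldc: not less than N
 *     1, &queue, 0, NULL, &event);
 * @endcode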
* * @ingroup HER2K */ __inline clAmdBlasStatus clAmdBlasCher2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose trans, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCher2k( order, uplo, trans, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /** * @example example_cher2k.c * This is an example of how to use the @ref clAmdBlasCher2k function. */ /** * @brief Rank-2k update of a hermitian matrix with double-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] B Buffer object storing the matrix \b B. * @param[in] offb Offset in number of elements for the first element in matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be * less than \b K if B is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clAmdBlasSuccess on success; * - \b clAmdBlasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clAmdBlasCher2k() function otherwise. 
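 *
 * The returned status can be checked against the codes listed above; a
 * minimal sketch of such a check (the argument names are placeholders for
 * values set up by the caller):
 * @code
 * clAmdBlasStatus err = clAmdBlasZher2k(order, uplo, trans, N, K, alpha,
 *                                       bufA, 0, lda, bufB, 0, ldb, beta,
 *                                       bufC, 0, ldc, 1, &queue, 0, NULL, &event);
 * switch (err) {
 * case clAmdBlasSuccess:
 *     break;
 * case clAmdBlasInvalidDevice:
 *     // the device lacks double-precision support
 *     break;
 * default:
 *     // the same error codes as clAmdBlasCher2k() apply otherwise
 *     break;
 * }
 * @endcode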
* * @ingroup HER2K */ __inline clAmdBlasStatus clAmdBlasZher2k( clAmdBlasOrder order, clAmdBlasUplo uplo, clAmdBlasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZher2k( order, uplo, trans, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } /*@}*/ #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* CLAMDBLAS_H_ */ clblas-2.10/src/clAmdBlas.version.h000066400000000000000000000016121264277366700171660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* the configured version and settings for clblas */ #define clAmdBlasVersionMajor 2 #define clAmdBlasVersionMinor 0 #define clAmdBlasVersionPatch 0 clblas-2.10/src/clBLAS-complex.h000066400000000000000000000025041264277366700163660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_COMPLEX_H_ #define CLBLAS_COMPLEX H_ #ifdef __cplusplus extern "C" { #endif typedef cl_float2 FloatComplex; typedef cl_double2 DoubleComplex; static __inline FloatComplex floatComplex(float real, float imag) { FloatComplex z; z.s[0] = real; z.s[1] = imag; return z; } static __inline DoubleComplex doubleComplex(double real, double imag) { DoubleComplex z; z.s[0] = real; z.s[1] = imag; return z; } #define CREAL(v) ((v).s[0]) #define CIMAG(v) ((v).s[1]) #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* CLBLAS_COMPLEX_H_ */ clblas-2.10/src/clBLAS.def000066400000000000000000000071331264277366700152330ustar00rootroot00000000000000;/* ************************************************************************ ; * Copyright 2013 Advanced Micro Devices, Inc. ; * ; * Licensed under the Apache License, Version 2.0 (the "License"); ; * you may not use this file except in compliance with the License. 
; * You may obtain a copy of the License at ; * ; * http://www.apache.org/licenses/LICENSE-2.0 ; * ; * Unless required by applicable law or agreed to in writing, software ; * distributed under the License is distributed on an "AS IS" BASIS, ; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ; * See the License for the specific language governing permissions and ; * limitations under the License. ; * ************************************************************************/ LIBRARY clBLAS EXPORTS clblasGetVersion clblasSetup clblasTeardown clblasSgemv clblasDgemv clblasCgemv clblasZgemv clblasSsymv clblasDsymv clblasSgemm clblasDgemm clblasCgemm clblasZgemm ;GEMMV2 is not exported ;clblasSgemmV2 ;clblasDgemmV2 ;clblasCgemmV2 ;clblasZgemmV2 clblasStrmm clblasDtrmm clblasCtrmm clblasZtrmm clblasStrsm clblasDtrsm clblasCtrsm clblasZtrsm clblasSsyr2k clblasDsyr2k clblasCsyr2k clblasZsyr2k clblasSsyrk clblasDsyrk clblasCsyrk clblasZsyrk ;GEMMV2 is not exported ;clblasSgemmExV2 ;clblasDgemmExV2 ;clblasCgemmExV2 ;clblasZgemmExV2 clblasStrmv clblasDtrmv clblasCtrmv clblasZtrmv clblasStrsv clblasDtrsv clblasCtrsv clblasZtrsv clblasStpsv clblasDtpsv clblasCtpsv clblasZtpsv clblasSsymm clblasDsymm clblasCsymm clblasZsymm clblasSger clblasDger clblasCgeru clblasZgeru clblasCgerc clblasZgerc clblasSsyr clblasDsyr clblasCher clblasZher clblasSsyr2 clblasDsyr2 clblasChemv clblasZhemv clblasCher2 clblasZher2 clblasChemm clblasZhemm clblasCherk clblasZherk clblasStpmv clblasDtpmv clblasCtpmv clblasZtpmv clblasSspmv clblasDspmv clblasChpmv clblasZhpmv clblasSspr clblasDspr clblasChpr clblasZhpr clblasSspr2 clblasDspr2 clblasChpr2 clblasZhpr2 clblasSgbmv clblasDgbmv clblasCgbmv clblasZgbmv clblasStbmv clblasDtbmv clblasCtbmv clblasZtbmv clblasSsbmv clblasDsbmv clblasChbmv clblasZhbmv clblasStbsv clblasDtbsv clblasCtbsv clblasZtbsv clblasCher2k clblasZher2k clblasSswap clblasDswap clblasCswap clblasZswap clblasSscal clblasDscal clblasCscal clblasZscal clblasCsscal clblasZdscal clblasScopy clblasDcopy clblasCcopy clblasZcopy clblasSaxpy clblasDaxpy clblasCaxpy clblasZaxpy clblasSdot clblasDdot clblasCdotu clblasZdotu clblasCdotc clblasZdotc clblasSrotg clblasDrotg clblasCrotg clblasZrotg clblasSrotmg clblasDrotmg clblasSrot clblasDrot clblasCsrot clblasZdrot clblasSrotm clblasDrotm clblasSnrm2 clblasDnrm2 clblasScnrm2 clblasDznrm2 clblasSasum clblasDasum clblasScasum clblasDzasum clblasiSamax clblasiDamax clblasiCamax clblasiZamax clblasAddScratchImage clblasRemoveScratchImage clblasMatrixSizeInfo clblasCreateMatrix clblasCreateMatrixWithLd clblasCreateMatrixFromHost clblasWriteSubMatrix clblasWriteSubMatrixAsync clblasReadSubMatrix clblasReadSubMatrixAsync clblasCopySubMatrix clblasCopySubMatrixAsync clblasWriteVector clblasWriteVectorAsync clblasReadVector clblasReadVectorAsync clblasCopyVector clblasCopyVectorAsync clblasWriteMatrix clblasWriteMatrixAsync clblasReadMatrix clblasReadMatrixAsync clblasCopyMatrix clblasCopyMatrixAsync clblasFillVector clblasFillVectorAsync clblasFillMatrix clblasFillSubMatrix clblasFillSubMatrixAsync clblas-2.10/src/clBLAS.h000066400000000000000000014055331264277366700147330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_H_ #define CLBLAS_H_ /** * @mainpage OpenCL BLAS * * This is an implementation of * * Basic Linear Algebra Subprograms, levels 1, 2 and 3 using * OpenCL and optimized for * the AMD GPU hardware. */ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #ifdef __cplusplus extern "C" { #endif /** * @defgroup OVERVIEW Overview * * This library provides an implementation of the Basic Linear Algebra Subprograms levels 1, 2 and 3, * using OpenCL and optimized for AMD GPU hardware. It provides BLAS-1 functions * SWAP, SCAL, COPY, AXPY, DOT, DOTU, DOTC, ROTG, ROTMG, ROT, ROTM, iAMAX, ASUM and NRM2, * BLAS-2 functions GEMV, SYMV, TRMV, TRSV, HEMV, SYR, SYR2, HER, HER2, GER, GERU, GERC, * TPMV, SPMV, HPMV, TPSV, SPR, SPR2, HPR, HPR2, GBMV, TBMV, SBMV, HBMV and TBSV * and BLAS-3 functions GEMM, SYMM, TRMM, TRSM, HEMM, HERK, HER2K, SYRK and SYR2K. * * This library’s primary goal is to assist the end user to enqueue OpenCL * kernels to process BLAS functions in an OpenCL-efficient manner, while * keeping interfaces familiar to users who know how to use BLAS. All * functions accept matrices through buffer objects. * * This library is entirely thread-safe with the exception of the following API : * clblasSetup and clblasTeardown. * Developers using the library can safely using any blas routine from different thread. * * @section deprecated * This library provided support for the creation of scratch images to achieve better performance * on older AMD APP SDK's. * However, memory buffers now give the same performance as buffers objects in the current SDK's. * Scratch image buffers are being deprecated and users are advised not to use scratch images in * new applications. */ /** * @defgroup TYPES clblas types */ /*@{*/ /** Shows how matrices are placed in memory. */ typedef enum clblasOrder_ { clblasRowMajor, /**< Every row is placed sequentially */ clblasColumnMajor /**< Every column is placed sequentially */ } clblasOrder; /** Used to specify whether the matrix is to be transposed or not. */ typedef enum clblasTranspose_ { clblasNoTrans, /**< Operate with the matrix. */ clblasTrans, /**< Operate with the transpose of the matrix. */ clblasConjTrans /**< Operate with the conjugate transpose of the matrix. */ } clblasTranspose; /** Used by the Hermitian, symmetric and triangular matrix * routines to specify whether the upper or lower triangle is being referenced. */ typedef enum clblasUplo_ { clblasUpper, /**< Upper triangle. */ clblasLower /**< Lower triangle. */ } clblasUplo; /** It is used by the triangular matrix routines to specify whether the * matrix is unit triangular. */ typedef enum clblasDiag_ { clblasUnit, /**< Unit triangular. */ clblasNonUnit /**< Non-unit triangular. */ } clblasDiag; /** Indicates the side matrix A is located relative to matrix B during multiplication. */ typedef enum clblasSide_ { clblasLeft, /**< Multiply general matrix by symmetric, Hermitian or triangular matrix on the left. 
*/ clblasRight /**< Multiply general matrix by symmetric, Hermitian or triangular matrix on the right. */ } clblasSide; /** * @brief clblas error codes definition, incorporating OpenCL error * definitions. * * This enumeration is a subset of the OpenCL error codes extended with some * additional extra codes. For example, CL_OUT_OF_HOST_MEMORY, which is * defined in cl.h is aliased as clblasOutOfHostMemory. */ typedef enum clblasStatus_ { clblasSuccess = CL_SUCCESS, clblasInvalidValue = CL_INVALID_VALUE, clblasInvalidCommandQueue = CL_INVALID_COMMAND_QUEUE, clblasInvalidContext = CL_INVALID_CONTEXT, clblasInvalidMemObject = CL_INVALID_MEM_OBJECT, clblasInvalidDevice = CL_INVALID_DEVICE, clblasInvalidEventWaitList = CL_INVALID_EVENT_WAIT_LIST, clblasOutOfResources = CL_OUT_OF_RESOURCES, clblasOutOfHostMemory = CL_OUT_OF_HOST_MEMORY, clblasInvalidOperation = CL_INVALID_OPERATION, clblasCompilerNotAvailable = CL_COMPILER_NOT_AVAILABLE, clblasBuildProgramFailure = CL_BUILD_PROGRAM_FAILURE, /* Extended error codes */ clblasNotImplemented = -1024, /**< Functionality is not implemented */ clblasNotInitialized, /**< clblas library is not initialized yet */ clblasInvalidMatA, /**< Matrix A is not a valid memory object */ clblasInvalidMatB, /**< Matrix B is not a valid memory object */ clblasInvalidMatC, /**< Matrix C is not a valid memory object */ clblasInvalidVecX, /**< Vector X is not a valid memory object */ clblasInvalidVecY, /**< Vector Y is not a valid memory object */ clblasInvalidDim, /**< An input dimension (M,N,K) is invalid */ clblasInvalidLeadDimA, /**< Leading dimension A must not be less than the size of the first dimension */ clblasInvalidLeadDimB, /**< Leading dimension B must not be less than the size of the second dimension */ clblasInvalidLeadDimC, /**< Leading dimension C must not be less than the size of the third dimension */ clblasInvalidIncX, /**< The increment for a vector X must not be 0 */ clblasInvalidIncY, /**< The increment for a vector Y must not be 0 */ clblasInsufficientMemMatA, /**< The memory object for Matrix A is too small */ clblasInsufficientMemMatB, /**< The memory object for Matrix B is too small */ clblasInsufficientMemMatC, /**< The memory object for Matrix C is too small */ clblasInsufficientMemVecX, /**< The memory object for Vector X is too small */ clblasInsufficientMemVecY /**< The memory object for Vector Y is too small */ } clblasStatus; /*@}*/ /** * @defgroup VERSION Version information */ /*@{*/ /** * @brief Get the clblas library version info. * * @param[out] major Location to store library's major version. * @param[out] minor Location to store library's minor version. * @param[out] patch Location to store library's patch version. * * @returns always \b clblasSuccess. * * @ingroup VERSION */ clblasStatus clblasGetVersion(cl_uint* major, cl_uint* minor, cl_uint* patch); /*@}*/ /** * @defgroup INIT Initialize library */ /*@{*/ /** * @brief Initialize the clblas library. * * Must be called before any other clblas API function is invoked. * @note This function is not thread-safe. * * @return * - \b clblasSucces on success; * - \b clblasOutOfHostMemory if there is not enough of memory to allocate * library's internal structures; * - \b clblasOutOfResources in case of requested resources scarcity. * * @ingroup INIT */ clblasStatus clblasSetup(void); /** * @brief Finalize the usage of the clblas library. * * Frees all memory allocated for different computational kernel and other * internal data. * @note This function is not thread-safe. 
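 *
 * A minimal lifecycle sketch: initialize the library once from a single
 * thread, issue clblas calls, then tear it down after all work has completed
 * (hypothetical host code):
 * @code
 * if (clblasSetup() != clblasSuccess) {
 *     // handle initialization failure
 * }
 * // ... create buffers and enqueue clblas* routines on one or more queues ...
 * clblasTeardown();   // not thread-safe; call once, after all clblas work is done
 * @endcode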
* * @ingroup INIT */ void clblasTeardown(void); /*@}*/ /** * @defgroup BLAS1 BLAS-1 functions * * The Level 1 Basic Linear Algebra Subprograms are functions that perform * vector-vector operations. */ /*@{*/ /*@}*/ /** * @defgroup SWAP SWAP - Swap elements from 2 vectors * @ingroup BLAS1 */ /*@{*/ /** * @brief interchanges two vectors of float. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SWAP */ clblasStatus clblasSswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sswap.c * Example of how to use the @ref clblasSswap function. */ /** * @brief interchanges two vectors of double. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
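 *
 * A minimal sketch that swaps two vectors of N double elements held in
 * hypothetical buffers \c bufX and \c bufY (offsets 0, unit increments),
 * assuming a valid \c queue:
 * @code
 * size_t N = 1024;
 * cl_event event = NULL;
 * clblasStatus err = clblasDswap(
 *     N,
 *     bufX, 0, 1,        // X: offset 0, increment 1
 *     bufY, 0, 1,        // Y: offset 0, increment 1
 *     1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode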
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSswap() function otherwise. * * @ingroup SWAP */ clblasStatus clblasDswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief interchanges two vectors of complex-float elements. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSwap() function otherwise. * * @ingroup SWAP */ clblasStatus clblasCswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief interchanges two vectors of double-complex elements. * * * @param[in] N Number of elements in vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasDwap() function otherwise. * * @ingroup SWAP */ clblasStatus clblasZswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SCAL SCAL - Scales a vector by a constant * @ingroup BLAS1 */ /*@{*/ /** * @brief Scales a float vector by a float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. 
* Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - \b incx zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SCAL */ clblasStatus clblasSscal( size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sscal.c * Example of how to use the @ref clblasSscal function. */ /** * @brief Scales a double vector by a double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSscal() function otherwise. * * @ingroup SCAL */ clblasStatus clblasDscal( size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Scales a complex-float vector by a complex-float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSscal() function otherwise. * * @ingroup SCAL */ clblasStatus clblasCscal( size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Scales a complex-double vector by a complex-double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasDscal() function otherwise. * * @ingroup SCAL */ clblasStatus clblasZscal( size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SSCAL SSCAL - Scales a complex vector by a real constant * @ingroup BLAS1 */ /*@{*/ /** * @brief Scales a complex-float vector by a float constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
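 *
 * A minimal sketch that scales a vector of N complex-float elements by the
 * real constant 0.5f, assuming a hypothetical pre-filled buffer \c bufX and
 * a valid \c queue:
 * @code
 * size_t N = 1024;
 * cl_event event = NULL;
 * clblasStatus err = clblasCsscal(
 *     N, 0.5f,           // real scaling factor
 *     bufX, 0, 1,        // X: offset 0, increment 1
 *     1, &queue, 0, NULL, &event);
 * @endcode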
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - \b incx zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SSCAL */ clblasStatus clblasCsscal( size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_csscal.c * Example of how to use the @ref clblasCsscal function. */ /** * @brief Scales a complex-double vector by a double constant * * - \f$ X \leftarrow \alpha X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasCsscal() function otherwise. * * @ingroup SSCAL */ clblasStatus clblasZdscal( size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup COPY COPY - Copies elements from vector X to vector Y * @ingroup BLAS1 */ /*@{*/ /** * @brief Copies float elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. 
* @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup COPY */ clblasStatus clblasScopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_scopy.c * Example of how to use the @ref clblasScopy function. */ /** * @brief Copies double elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasScopy() function otherwise. * * @ingroup COPY */ clblasStatus clblasDcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies complex-float elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. 
Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasScopy() function otherwise. * * @ingroup COPY */ clblasStatus clblasCcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies complex-double elements from vector X to vector Y * * - \f$ Y \leftarrow X \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasDcopy() function otherwise. * * @ingroup COPY */ clblasStatus clblasZcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup AXPY AXPY - Scale X and add to Y * @ingroup BLAS1 */ /*@{*/ /** * @brief Scale vector X of float elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
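 *
 * A minimal, hypothetical single-queue sketch (clblasSetup() is assumed to
 * have been called, and \b ctx and \b queue are assumed to be a valid
 * cl_context and cl_command_queue; the helper name and data are illustrative):
 * @code
 * void saxpy_sketch(cl_context ctx, cl_command_queue queue)
 * {
 *     const size_t N = 4;
 *     cl_float x[4] = {1, 2, 3, 4};
 *     cl_float y[4] = {4, 3, 2, 1};
 *     cl_int err;
 *     cl_event event = NULL;
 *
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), x, &err);
 *     cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), y, &err);
 *
 *     // Y <- 2.0f * X + Y
 *     err = clblasSaxpy(N, 2.0f, bufX, 0, 1, bufY, 0, 1,
 *                       1, &queue, 0, NULL, &event);
 *     if (err == clblasSuccess) {
 *         clWaitForEvents(1, &event);
 *         clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float),
 *                             y, 0, NULL, NULL); // y becomes {6, 7, 8, 9}
 *     }
 *     clReleaseMemObject(bufY);
 *     clReleaseMemObject(bufX);
 * }
 * @endcode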
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup AXPY */ clblasStatus clblasSaxpy( size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_saxpy.c * Example of how to use the @ref clblasSaxpy function. */ /** * @brief Scale vector X of double elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSaxpy() function otherwise. * * @ingroup AXPY */ clblasStatus clblasDaxpy( size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Scale vector X of complex-float elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. 
Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSaxpy() function otherwise. * * @ingroup AXPY */ clblasStatus clblasCaxpy( size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Scale vector X of double-complex elements and add to Y * * - \f$ Y \leftarrow \alpha X + Y \f$ * * @param[in] N Number of elements in vector \b X. * @param[in] alpha The constant factor for vector \b X. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasDaxpy() function otherwise. * * @ingroup AXPY */ clblasStatus clblasZaxpy( size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup DOT DOT - Dot product of two vectors * @ingroup BLAS1 */ /*@{*/ /** * @brief dot product of two vectors containing float elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
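 *
 * A minimal, hypothetical sketch of a single-queue call (clblasSetup(), a
 * valid \b ctx and \b queue are assumed to exist; the helper name and data
 * are illustrative). Note the scratch buffer sized for at least N elements:
 * @code
 * void sdot_sketch(cl_context ctx, cl_command_queue queue)
 * {
 *     const size_t N = 4;
 *     cl_float x[4] = {1, 2, 3, 4}, y[4] = {1, 1, 1, 1}, dot = 0.0f;
 *     cl_int err;
 *     cl_event event = NULL;
 *
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), x, &err);
 *     cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), y, &err);
 *     cl_mem bufDot = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                    sizeof(cl_float), NULL, &err);
 *     cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     N * sizeof(cl_float), NULL, &err);
 *
 *     err = clblasSdot(N, bufDot, 0, bufX, 0, 1, bufY, 0, 1, scratch,
 *                      1, &queue, 0, NULL, &event);
 *     if (err == clblasSuccess) {
 *         clWaitForEvents(1, &event);
 *         clEnqueueReadBuffer(queue, bufDot, CL_TRUE, 0, sizeof(cl_float),
 *                             &dot, 0, NULL, NULL); // dot == 10.0f
 *     }
 *     clReleaseMemObject(scratch);
 *     clReleaseMemObject(bufDot);
 *     clReleaseMemObject(bufY);
 *     clReleaseMemObject(bufX);
 * }
 * @endcode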
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, \b Y or \b dotProduct object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup DOT */ clblasStatus clblasSdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sdot.c * Example of how to use the @ref clblasSdot function. */ /** * @brief dot product of two vectors containing double elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSdot() function otherwise. * * @ingroup DOT */ clblasStatus clblasDdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief dot product of two vectors containing float-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. 
* @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSdot() function otherwise. * * @ingroup DOT */ clblasStatus clblasCdotu( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief dot product of two vectors containing double-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSdot() function otherwise. * * @ingroup DOT */ clblasStatus clblasZdotu( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief dot product of two vectors containing float-complex elements conjugating the first vector * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. 
* @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSdot() function otherwise. * * @ingroup DOT */ clblasStatus clblasCdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief dot product of two vectors containing double-complex elements conjugating the first vector * * @param[in] N Number of elements in vector \b X. * @param[out] dotProduct Buffer object that will contain the dot-product value * @param[in] offDP Offset to dot-product in \b dotProduct buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSdot() function otherwise. * * @ingroup DOT */ clblasStatus clblasZdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup ROTG ROTG - Constructs givens plane rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief construct givens plane rotation on float elements * * @param[out] SA Buffer object that contains SA * @param[in] offSA Offset to SA in \b SA buffer object. * Counted in elements. * @param[out] SB Buffer object that contains SB * @param[in] offSB Offset to SB in \b SB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. 
* @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidMemObject if either \b SA, \b SB, \b C or \b S object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTG */ clblasStatus clblasSrotg( cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_srotg.c * Example of how to use the @ref clblasSrotg function. */ /** * @brief construct givens plane rotation on double elements * * @param[out] DA Buffer object that contains DA * @param[in] offDA Offset to DA in \b DA buffer object. * Counted in elements. * @param[out] DB Buffer object that contains DB * @param[in] offDB Offset to DB in \b DB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSrotg() function otherwise. * * @ingroup ROTG */ clblasStatus clblasDrotg( cl_mem DA, size_t offDA, cl_mem DB, size_t offDB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief construct givens plane rotation on float-complex elements * * @param[out] CA Buffer object that contains CA * @param[in] offCA Offset to CA in \b CA buffer object. * Counted in elements. * @param[out] CB Buffer object that contains CB * @param[in] offCB Offset to CB in \b CB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C. C is real. * @param[in] offC Offset to C in \b C buffer object. 
* Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSrotg() function otherwise. * * @ingroup ROTG */ clblasStatus clblasCrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief construct givens plane rotation on double-complex elements * * @param[out] CA Buffer object that contains CA * @param[in] offCA Offset to CA in \b CA buffer object. * Counted in elements. * @param[out] CB Buffer object that contains CB * @param[in] offCB Offset to CB in \b CB buffer object. * Counted in elements. * @param[out] C Buffer object that contains C. C is real. * @param[in] offC Offset to C in \b C buffer object. * Counted in elements. * @param[out] S Buffer object that contains S * @param[in] offS Offset to S in \b S buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasDrotg() function otherwise. * * @ingroup ROTG */ clblasStatus clblasZrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup ROTMG ROTMG - Constructs the modified givens rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief construct the modified givens rotation on float elements * * @param[out] SD1 Buffer object that contains SD1 * @param[in] offSD1 Offset to SD1 in \b SD1 buffer object. * Counted in elements. * @param[out] SD2 Buffer object that contains SD2 * @param[in] offSD2 Offset to SD2 in \b SD2 buffer object. * Counted in elements. * @param[out] SX1 Buffer object that contains SX1 * @param[in] offSX1 Offset to SX1 in \b SX1 buffer object. * Counted in elements. * @param[in] SY1 Buffer object that contains SY1 * @param[in] offSY1 Offset to SY1 in \b SY1 buffer object. * Counted in elements. * @param[out] SPARAM Buffer object that contains SPARAM array of minimum length 5 SPARAM(0) = SFLAG SPARAM(1) = SH11 SPARAM(2) = SH21 SPARAM(3) = SH12 SPARAM(4) = SH22 * @param[in] offSparam Offset to SPARAM in \b SPARAM buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidMemObject if either \b SX1, \b SY1, \b SD1, \b SD2 or \b SPARAM object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTMG */ clblasStatus clblasSrotmg( cl_mem SD1, size_t offSD1, cl_mem SD2, size_t offSD2, cl_mem SX1, size_t offSX1, const cl_mem SY1, size_t offSY1, cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_srotmg.c * Example of how to use the @ref clblasSrotmg function. */ /** * @brief construct the modified givens rotation on double elements * * @param[out] DD1 Buffer object that contains DD1 * @param[in] offDD1 Offset to DD1 in \b DD1 buffer object. * Counted in elements. * @param[out] DD2 Buffer object that contains DD2 * @param[in] offDD2 Offset to DD2 in \b DD2 buffer object. * Counted in elements. * @param[out] DX1 Buffer object that contains DX1 * @param[in] offDX1 Offset to DX1 in \b DX1 buffer object. * Counted in elements. * @param[in] DY1 Buffer object that contains DY1 * @param[in] offDY1 Offset to DY1 in \b DY1 buffer object. * Counted in elements. * @param[out] DPARAM Buffer object that contains DPARAM array of minimum length 5 DPARAM(0) = DFLAG DPARAM(1) = DH11 DPARAM(2) = DH21 DPARAM(3) = DH12 DPARAM(4) = DH22 * @param[in] offDparam Offset to DPARAM in \b DPARAM buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSrotmg() function otherwise. * * @ingroup ROTMG */ clblasStatus clblasDrotmg( cl_mem DD1, size_t offDD1, cl_mem DD2, size_t offDD2, cl_mem DX1, size_t offDX1, const cl_mem DY1, size_t offDY1, cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup ROT ROT - Apply givens rotation * @ingroup BLAS1 */ /*@{*/ /** * @brief applies a plane rotation for float elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. 
Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. * @param[in] S S specifies the sine, sin. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROT */ clblasStatus clblasSrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_srot.c * Example of how to use the @ref clblasSrot function. */ /** * @brief applies a plane rotation for double elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. * @param[in] S S specifies the sine, sin. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSrot() function otherwise. 
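 *
 * A minimal, hypothetical sketch (clblasSetup(), a valid \b ctx and \b queue
 * are assumed to have been set up elsewhere; the helper name and data are
 * illustrative only):
 * @code
 * void drot_sketch(cl_context ctx, cl_command_queue queue)
 * {
 *     const size_t N = 2;
 *     cl_double x[2] = {1.0, 0.0}, y[2] = {0.0, 1.0};
 *     cl_int err;
 *     cl_event event = NULL;
 *
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_double), x, &err);
 *     cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_double), y, &err);
 *
 *     // Apply the rotation with C = cos(theta) = 0.0 and S = sin(theta) = 1.0
 *     err = clblasDrot(N, bufX, 0, 1, bufY, 0, 1, 0.0, 1.0,
 *                      1, &queue, 0, NULL, &event);
 *     if (err == clblasSuccess)
 *         clWaitForEvents(1, &event);
 *
 *     clReleaseMemObject(bufY);
 *     clReleaseMemObject(bufX);
 * }
 * @endcode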
* * @ingroup ROT */ clblasStatus clblasDrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief applies a plane rotation for float-complex elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. This number is real * @param[in] S S specifies the sine, sin. This number is real * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSrot() function otherwise. * * @ingroup ROT */ clblasStatus clblasCsrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief applies a plane rotation for double-complex elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] C C specifies the cosine, cos. This number is real * @param[in] S S specifies the sine, sin. This number is real * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSrot() function otherwise. 
* * @ingroup ROT */ clblasStatus clblasZdrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup ROTM ROTM - Apply modified givens rotation for points in the plane * @ingroup BLAS1 */ /*@{*/ /** * @brief modified givens rotation for float elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] SPARAM Buffer object that contains SPARAM array of minimum length 5 * SPARAM(1)=SFLAG * SPARAM(2)=SH11 * SPARAM(3)=SH21 * SPARAM(4)=SH12 * SPARAM(5)=SH22 * @param[in] offSparam Offset of first element of array \b SPARAM in buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b X, \b Y or \b SPARAM object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ROTM */ clblasStatus clblasSrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_srotm.c * Example of how to use the @ref clblasSrotm function. */ /** * @brief modified givens rotation for double elements * * @param[in] N Number of elements in vector \b X and \b Y. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] Y Buffer object storing the vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. 
* @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] DPARAM Buffer object that contains SPARAM array of minimum length 5 * DPARAM(1)=DFLAG * DPARAM(2)=DH11 * DPARAM(3)=DH21 * DPARAM(4)=DH12 * DPARAM(5)=DH22 * @param[in] offDparam Offset of first element of array \b DPARAM in buffer object. * Counted in elements. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSrotm() function otherwise. * * @ingroup ROTM */ clblasStatus clblasDrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup NRM2 NRM2 - Euclidean norm of a vector * @ingroup BLAS1 */ /*@{*/ /** * @brief computes the euclidean norm of vector containing float elements * * NRM2 = sqrt( X' * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if any of \b X or \b NRM2 or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
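 *
 * A minimal, hypothetical sketch (clblasSetup(), a valid \b ctx and \b queue
 * are assumed; the helper name and data are illustrative). Note that the
 * scratch buffer must hold at least 2*N elements:
 * @code
 * void snrm2_sketch(cl_context ctx, cl_command_queue queue)
 * {
 *     const size_t N = 3;
 *     cl_float x[3] = {3.0f, 0.0f, 4.0f}, nrm2 = 0.0f;
 *     cl_int err;
 *     cl_event event = NULL;
 *
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), x, &err);
 *     cl_mem bufNrm2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     sizeof(cl_float), NULL, &err);
 *     cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     2 * N * sizeof(cl_float), NULL, &err);
 *
 *     err = clblasSnrm2(N, bufNrm2, 0, bufX, 0, 1, scratch,
 *                       1, &queue, 0, NULL, &event);
 *     if (err == clblasSuccess) {
 *         clWaitForEvents(1, &event);
 *         clEnqueueReadBuffer(queue, bufNrm2, CL_TRUE, 0, sizeof(cl_float),
 *                             &nrm2, 0, NULL, NULL); // nrm2 == 5.0f
 *     }
 *     clReleaseMemObject(scratch);
 *     clReleaseMemObject(bufNrm2);
 *     clReleaseMemObject(bufX);
 * }
 * @endcode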
* * @ingroup NRM2 */ clblasStatus clblasSnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_snrm2.c * Example of how to use the @ref clblasSnrm2 function. */ /** * @brief computes the euclidean norm of vector containing double elements * * NRM2 = sqrt( X' * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSnrm2() function otherwise. * * @ingroup NRM2 */ clblasStatus clblasDnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief computes the euclidean norm of vector containing float-complex elements * * NRM2 = sqrt( X**H * X ) * * @param[in] N Number of elements in vector \b X. * @param[out] NRM2 Buffer object that will contain the NRM2 value. * Note that the answer of Scnrm2 is a real value. * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSnrm2() function otherwise. * * @ingroup NRM2 */ clblasStatus clblasScnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief computes the euclidean norm of vector containing double-complex elements * * NRM2 = sqrt( X**H * X ) * * @param[in] N Number of elements in vector \b X. 
* @param[out] NRM2 Buffer object that will contain the NRM2 value. * Note that the answer of Dznrm2 is a real value. * @param[in] offNRM2 Offset to NRM2 value in \b NRM2 buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object that can hold minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSnrm2() function otherwise. * * @ingroup NRM2 */ clblasStatus clblasDznrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup iAMAX iAMAX - Index of max absolute value * @ingroup BLAS1 */ /*@{*/ /** * @brief index of max absolute value in a float array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance.
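 *
 * A minimal, hypothetical sketch (clblasSetup(), a valid \b ctx and \b queue
 * are assumed; the helper name and data are illustrative). The result is a
 * single cl_uint index, and the scratch buffer must hold at least 2*N elements:
 * @code
 * void isamax_sketch(cl_context ctx, cl_command_queue queue)
 * {
 *     const size_t N = 4;
 *     cl_float x[4] = {1.0f, -7.0f, 3.0f, 2.0f};
 *     cl_uint imax = 0;
 *     cl_int err;
 *     cl_event event = NULL;
 *
 *     cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                                  N * sizeof(cl_float), x, &err);
 *     cl_mem bufIMax = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     sizeof(cl_uint), NULL, &err);
 *     cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
 *                                     2 * N * sizeof(cl_float), NULL, &err);
 *
 *     err = clblasiSamax(N, bufIMax, 0, bufX, 0, 1, scratch,
 *                        1, &queue, 0, NULL, &event);
 *     if (err == clblasSuccess) {
 *         clWaitForEvents(1, &event);
 *         // imax receives the index of the first element of maximum |x[i]|
 *         clEnqueueReadBuffer(queue, bufIMax, CL_TRUE, 0, sizeof(cl_uint),
 *                             &imax, 0, NULL, NULL);
 *     }
 *     clReleaseMemObject(scratch);
 *     clReleaseMemObject(bufIMax);
 *     clReleaseMemObject(bufX);
 * }
 * @endcode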
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if any of \b iMax, \b X or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup iAMAX */ clblasStatus clblasiSamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_isamax.c * Example of how to use the @ref clblasiSamax function. */ /** * @brief index of max absolute value in a double array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasiSamax() function otherwise. * * @ingroup iAMAX */ clblasStatus clblasiDamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief index of max absolute value in a complex float array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero.
* @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasiSamax() function otherwise. * * @ingroup iAMAX */ clblasStatus clblasiCamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief index of max absolute value in a complex double array * * @param[in] N Number of elements in vector \b X. * @param[out] iMax Buffer object storing the index of first absolute max. * The index will be of type unsigned int. * @param[in] offiMax Offset for storing index in the buffer iMax. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem object to store intermediate results. It should be able to hold a minimum of (2*N) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasiSamax() function otherwise. * * @ingroup iAMAX */ clblasStatus clblasiZamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup ASUM ASUM - Sum of absolute values * @ingroup BLAS1 */ /*@{*/ /** * @brief absolute sum of values of a vector containing float elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance.
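 *
 * A minimal call sketch (the context \b ctx, queue \b queue and pre-filled
 * buffer \b bufX of \b N floats are hypothetical objects assumed to exist):
 * @code
 * cl_int clErr;
 * // result buffer holds one float; scratch must hold at least N elements
 * cl_mem asum    = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_float), NULL, &clErr);
 * cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &clErr);
 * cl_event event = NULL;
 * clblasStatus err = clblasSasum(N, asum, 0, bufX, 0, 1, scratch,
 *                                1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event);   // the sum can then be read back from asum
 * @endcode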
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if any of \b X or \b asum or \b scratchBuff object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup ASUM */ clblasStatus clblasSasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sasum.c * Example of how to use the @ref clblasSasum function. */ /** * @brief absolute sum of values of a vector containing double elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSasum() function otherwise. * * @ingroup ASUM */ clblasStatus clblasDasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief absolute sum of values of a vector containing float-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero.
* @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - the same error codes as the clblasSasum() function otherwise. * * @ingroup ASUM */ clblasStatus clblasScasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief absolute sum of values of a vector containing double-complex elements * * @param[in] N Number of elements in vector \b X. * @param[out] asum Buffer object that will contain the absolute sum value * @param[in] offAsum Offset to absolute sum in \b asum buffer object. * Counted in elements. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object of minimum size N * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - the same error codes as the clblasSasum() function otherwise. * * @ingroup ASUM */ clblasStatus clblasDzasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup BLAS2 BLAS-2 functions * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * matrix-vector operations. */ /*@{*/ /*@}*/ /** * @defgroup GEMV GEMV - General matrix-Vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a general rectangular matrix and * float elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] x Buffer object storing vector \b x. 
* @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSgemv() function otherwise. * * @ingroup GEMV */ clblasStatus clblasSgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sgemv.c * This is an example of how to use the @ref clblasSgemvEx function. */ /** * @brief Matrix-vector product with a general rectangular matrix and * double elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of \b A in the buffer * object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clblasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSgemv() function otherwise. 
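 *
 * A hedged usage sketch for the column-major, non-transposed case (the queue
 * \b queue and the pre-filled buffers \b bufA (M x N doubles, lda = M),
 * \b bufX (N doubles) and \b bufY (M doubles) are assumed, hypothetical objects):
 * @code
 * cl_event event = NULL;
 * clblasStatus err = clblasDgemv(clblasColumnMajor, clblasNoTrans, M, N,
 *                                1.0, bufA, 0, M,     // alpha, A, offA, lda
 *                                bufX, 0, 1,          // x, offx, incx
 *                                0.0, bufY, 0, 1,     // beta, y, offy, incy
 *                                1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event);  // y now holds alpha*A*x + beta*y
 * @endcode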
* * @ingroup GEMV */ clblasStatus clblasDgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a general rectangular matrix and * float complex elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clblasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSgemv() function otherwise. * * @ingroup GEMV */ clblasStatus clblasCgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a general rectangular matrix and * double complex elements. Extended version. * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * - \f$ y \leftarrow \alpha A^T x + \beta y \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For a detailed description, * see clblasSgemv(). * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b x. 
It cannot be zero. * @param[in] beta The factor of the vector \b y. * @param[out] y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support the * floating point arithmetic with double precision; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSgemv() function otherwise. * * @ingroup GEMV */ clblasStatus clblasZgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SYMV SYMV - Symmetric matrix-Vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric matrix and float elements. * * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSgemv() function otherwise.
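 *
 * A brief call sketch (only the upper triangle of the N x N matrix in the
 * hypothetical buffer \b bufA is referenced; \b queue, \b bufX and \b bufY
 * are likewise assumed to exist and to be correctly sized):
 * @code
 * cl_event event = NULL;
 * clblasStatus err = clblasSsymv(clblasColumnMajor, clblasUpper, N,
 *                                1.0f, bufA, 0, N,    // alpha, A, offA, lda
 *                                bufX, 0, 1,          // x, offx, incx
 *                                0.0f, bufY, 0, 1,    // beta, y, offy, incy
 *                                1, &queue, 0, NULL, &event);
 * @endcode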
* * @ingroup SYMV */ clblasStatus clblasSsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_ssymv.c * This is an example of how to use the @ref clblasSsymv function. */ /** * @brief Matrix-vector product with a symmetric matrix and double elements. * * * Matrix-vector products: * - \f$ y \leftarrow \alpha A x + \beta y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in * the buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] x Buffer object storing vector \b x. * @param[in] offx Offset of first element of vector \b x in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b x. It cannot be zero. * @param[in] beta The factor of vector \b y. * @param[out] y Buffer object storing vector \b y. * @param[in] offy Offset of first element of vector \b y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if \b offA exceeds the size of \b A buffer * object; * - the same error codes as the clblasSsymv() function otherwise. * * @ingroup SYMV */ clblasStatus clblasDsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HEMV HEMV - Hermitian matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a hermitian matrix and float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero.
* @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HEMV */ clblasStatus clblasChemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a hermitian matrix and double-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list.
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChemv() function otherwise. * * @ingroup HEMV */ clblasStatus clblasZhemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_zhemv.cpp * Example of how to use the @ref clblasZhemv function. */ /*@}*/ /** * @defgroup TRMV TRMV - Triangular matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a triangular matrix and * float elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
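 *
 * A short sketch showing the in-place update and the required scratch buffer
 * (the context \b ctx, queue \b queue and buffers \b bufA, \b bufX are
 * hypothetical, pre-created objects; with incx = 1 the scratch needs N floats):
 * @code
 * cl_int clErr;
 * cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &clErr);
 * cl_event event = NULL;
 * clblasStatus err = clblasStrmv(clblasColumnMajor, clblasUpper, clblasNoTrans,
 *                                clblasNonUnit, N, bufA, 0, N,
 *                                bufX, 0, 1,          // X is overwritten with A*X
 *                                scratch, 1, &queue, 0, NULL, &event);
 * @endcode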
* * @ingroup TRMV */ clblasStatus clblasStrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_strmv.c * Example of how to use the @ref clblasStrmv function. */ /** * @brief Matrix-vector product with a triangular matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasStrmv() function otherwise. * * @ingroup TRMV */ clblasStatus clblasDtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a triangular matrix and * float complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasStrmv() function. * @ingroup TRMV */ clblasStatus clblasCtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a triangular matrix and * double complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDtrmv() function. * @ingroup TRMV */ clblasStatus clblasZtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TRSV TRSV - Triangular matrix vector Solve * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TRSV */ clblasStatus clblasStrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_strsv.c * Example of how to use the @ref clblasStrsv function. */ /** * @brief solving triangular matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasStrsv() function otherwise. * * @ingroup TRSV */ clblasStatus clblasDtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular matrix problems with float-complex elements. 
* * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasStrsv() function. * * @ingroup TRSV */ clblasStatus clblasCtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular matrix problems with double-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDtrsv() function. * * @ingroup TRSV */ clblasStatus clblasZtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup GER GER - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief vector-vector product with float elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. 
* @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b M, \b N or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clblasInvalidMemObject if A, X, or Y object is invalid, * or an image object rather than the buffer one; * - \b clblasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GER */ clblasStatus clblasSger( clblasOrder order, size_t M, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sger.c * Example of how to use the @ref clblasSger function. */ /** * @brief vector-vector product with double elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. 
* @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSger() function otherwise. * * @ingroup GER */ clblasStatus clblasDger( clblasOrder order, size_t M, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup GERU GERU - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief vector-vector product with float complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b M, \b N or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clblasInvalidMemObject if A, X, or Y object is invalid, * or an image object rather than the buffer one; * - \b clblasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GERU */ clblasStatus clblasCgeru( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A , size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief vector-vector product with double complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^T + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCgeru() function otherwise. 
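 *
 * A minimal sketch of the rank 1 update (the queue \b queue and buffers
 * \b bufX (M elements), \b bufY (N elements) and \b bufA (M x N, lda = M,
 * column-major) are assumed, hypothetical objects):
 * @code
 * cl_double2 alpha = {{1.0, 0.0}};   // complex scalar alpha = 1 + 0i
 * cl_event event = NULL;
 * clblasStatus err = clblasZgeru(clblasColumnMajor, M, N, alpha,
 *                                bufX, 0, 1,          // X, offx, incx
 *                                bufY, 0, 1,          // Y, offy, incy
 *                                bufA, 0, M,          // A, offa, lda
 *                                1, &queue, 0, NULL, &event);
 * @endcode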
* * @ingroup GERU */ clblasStatus clblasZgeru( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup GERC GERC - General matrix rank 1 operation * @ingroup BLAS2 */ /*@{*/ /** * @brief vector-vector product with float complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^H + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b M, \b N or * - either \b incx or \b incy is zero, or * - a leading dimension is invalid; * - \b clblasInvalidMemObject if A, X, or Y object is invalid, * or an image object rather than the buffer one; * - \b clblasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
* * @ingroup GERC */ clblasStatus clblasCgerc( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A , size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief vector-vector product with double complex elements and * performs the rank 1 operation A * * Vector-vector products: * - \f$ A \leftarrow \alpha X Y^H + A \f$ * * @param[in] order Row/column order. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha specifies the scalar alpha. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. On exit, A is * overwritten by the updated matrix. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCgerc() function otherwise. * * @ingroup GERC */ clblasStatus clblasZgerc( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SYR SYR - Symmetric rank 1 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 1 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 1 operation with a general triangular matrix and * float elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha x x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SYR */ clblasStatus clblasSsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @brief Symmetric rank 1 operation with a general triangular matrix and * double elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha x x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSsyr() function otherwise. * * @ingroup SYR */ clblasStatus clblasDsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup HER HER - Hermitian rank 1 operation * * The Level 2 Basic Linear Algebra Subprogram functions that perform * hermitian rank 1 operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief hermitian rank 1 operation with a general triangular matrix and * float-complex elements. 
* * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HER */ clblasStatus clblasCher( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_cher.c * Example of how to use the @ref clblasCher function. */ /** * @brief hermitian rank 1 operation with a general triangular matrix and * double-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar double value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCher() function otherwise. * * @ingroup HER */ clblasStatus clblasZher( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup SYR2 SYR2 - Symmetric rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 2 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 2 operation with a general triangular matrix and * float elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
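 *
 * A minimal call sketch (illustrative, not from the original reference):
 * \c queue, \c bufX, \c bufY and \c bufA are assumed to be a valid command
 * queue and pre-initialized buffers; the names are placeholders.
 * \code
 * size_t N = 5;
 * cl_event event = NULL;
 * clblasStatus err = clblasSsyr2(clblasColumnMajor, clblasUpper, N,
 *                                2.0f,           /* alpha             */
 *                                bufX, 0, 1,     /* X, offx, incx     */
 *                                bufY, 0, 1,     /* Y, offy, incy     */
 *                                bufA, 0, N,     /* A, offa, lda >= N */
 *                                1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event); /* only the referenced triangle of A is updated */
 * \endcode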
* * @ingroup SYR2 */ clblasStatus clblasSsyr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @brief Symmetric rank 2 operation with a general triangular matrix and * double elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha x y^T + \alpha y x^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset of first element of matrix \b A in buffer object. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SYR2 */ clblasStatus clblasDsyr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup HER2 HER2 - Hermitian rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * hermitian rank 2 update operations. * @ingroup BLAS2 */ /*@{*/ /** * @brief Hermitian rank 2 operation with a general triangular matrix and * float-compelx elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. 
* @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HER2 */ clblasStatus clblasCher2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @brief Hermitian rank 2 operation with a general triangular matrix and * double-compelx elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \overline{ \alpha } Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. 
* @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCher2() function otherwise. * * @ingroup HER2 */ clblasStatus clblasZher2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_zher2.c * Example of how to use the @ref clblasZher2 function. */ /*@}*/ /** * @defgroup TPMV TPMV - Triangular packed matrix-vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a packed triangular matrix and * float elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero * - \b clblasInvalidMemObject if either \b AP or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
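 *
 * A minimal call sketch (illustrative, not from the original reference):
 * \c queue is assumed to be a valid command queue, \c bufAP a packed
 * triangular matrix of at least N*(N+1)/2 elements, \c bufX a vector of at
 * least (1 + (N-1)*abs(incx)) elements, and \c scratch a scratch buffer of
 * the same minimum size; all names are placeholders.
 * \code
 * size_t N = 6;
 * cl_event event = NULL;
 * clblasStatus err = clblasStpmv(clblasColumnMajor, clblasUpper,
 *                                clblasNoTrans, clblasNonUnit, N,
 *                                bufAP, 0,       /* AP, offa            */
 *                                bufX, 0, 1,     /* X, offx, incx       */
 *                                scratch,        /* temporary workspace */
 *                                1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event); /* X now holds A*X */
 * \endcode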
* * @ingroup TPMV */ clblasStatus clblasStpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_stpmv.c * Example of how to use the @ref clblasStpmv function. */ /** * @brief Matrix-vector product with a packed triangular matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasStpmv() function otherwise. * * @ingroup TPMV */ clblasStatus clblasDtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a packed triangular matrix and * float-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasStpmv() function. * @ingroup TPMV */ clblasStatus clblasCtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a packed triangular matrix and * double-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b AP is to be transposed. * @param[in] diag Specify whether matrix \b AP is unit triangular. * @param[in] N Number of rows/columns in matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP in packed format. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDtpmv() function. * @ingroup TPMV */ clblasStatus clblasZtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TPSV TPSV - Triangular packed matrix vector solve * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular packed matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ clblasStatus clblasStpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_stpsv.c * Example of how to use the @ref clblasStpsv function. */ /** * @brief solving triangular packed matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. 
* * @ingroup TPSV */ clblasStatus clblasDtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular packed matrix problems with float complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ clblasStatus clblasCtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular packed matrix problems with double complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in matrix \b A. * @param[in] A Buffer object storing matrix in packed format.\b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TPSV */ clblasStatus clblasZtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SPMV SPMV - Symmetric packed matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric packed-matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the matrix sizes or the vector sizes along with the increments lead to * accessing outsize of any of the buffers; * - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPMV */ clblasStatus clblasSspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sspmv.c * This is an example of how to use the @ref clblasSspmv function. */ /** * @brief Matrix-vector product with a symmetric packed-matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSspmv() function otherwise. 
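 *
 * A minimal call sketch (illustrative, not from the original reference):
 * \c queue, \c bufAP (packed symmetric matrix), \c bufX and \c bufY are
 * assumed to be created and filled elsewhere; the names are placeholders.
 * \code
 * size_t N = 5;
 * cl_event event = NULL;
 * clblasStatus err = clblasDspmv(clblasColumnMajor, clblasUpper, N,
 *                                1.0,            /* alpha          */
 *                                bufAP, 0,       /* AP, offa       */
 *                                bufX, 0, 1,     /* X, offx, incx  */
 *                                0.0,            /* beta           */
 *                                bufY, 0, 1,     /* Y, offy, incy  */
 *                                1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event); /* Y = alpha*A*X + beta*Y */
 * \endcode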
* * @ingroup SPMV */ clblasStatus clblasDspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HPMV HPMV - Hermitian packed matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a packed hermitian matrix and float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing packed matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx or \b incy is zero, or * - the matrix sizes or the vector sizes along with the increments lead to * accessing outsize of any of the buffers; * - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is * invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HPMV */ clblasStatus clblasChpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_chpmv.c * This is an example of how to use the @ref clblasChpmv function. */ /** * @brief Matrix-vector product with a packed hermitian matrix and double-complex elements. 
* * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in matrix \b AP. * @param[in] alpha The factor of matrix \b AP. * @param[in] AP Buffer object storing packed matrix \b AP. * @param[in] offa Offset in number of elements for first element in matrix \b AP. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChpmv() function otherwise. * * @ingroup HPMV */ clblasStatus clblasZhpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SPR SPR - Symmetric packed matrix rank 1 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 1 update operations on packed matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 1 operation with a general triangular packed-matrix and * float elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha X X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
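 *
 * A minimal call sketch (illustrative, not from the original reference):
 * \c queue, \c bufX and \c bufAP (packed symmetric matrix of at least
 * N*(N+1)/2 elements) are assumed to exist; the names are placeholders.
 * \code
 * size_t N = 5;
 * cl_event event = NULL;
 * clblasStatus err = clblasSspr(clblasColumnMajor, clblasLower, N,
 *                               1.0f,            /* alpha          */
 *                               bufX, 0, 1,      /* X, offx, incx  */
 *                               bufAP, 0,        /* AP, offa       */
 *                               1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event); /* AP += alpha * x * x^T (packed) */
 * \endcode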
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero * - \b clblasInvalidMemObject if either \b AP, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPR */ clblasStatus clblasSspr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_sspr.c * Example of how to use the @ref clblasSspr function. */ /** * @brief Symmetric rank 1 operation with a general triangular packed-matrix and * double elements. * * Symmetric rank 1 operation: * - \f$ A \leftarrow \alpha X X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSspr() function otherwise. * * @ingroup SPR */ clblasStatus clblasDspr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup HPR HPR - Hermitian packed matrix rank 1 update * * The Level 2 Basic Linear Algebra Subprogram functions that perform * hermitian rank 1 operations on packed matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief hermitian rank 1 operation with a general triangular packed-matrix and * float-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. 
* @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b N is zero, or * - either \b incx is zero * - \b clblasInvalidMemObject if either \b AP, \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HPR */ clblasStatus clblasChpr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_chpr.c * Example of how to use the @ref clblasChpr function. */ /** * @brief hermitian rank 1 operation with a general triangular packed-matrix and * double-complex elements. * * hermitian rank 1 operation: * - \f$ A \leftarrow \alpha X X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A (a scalar float value) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[out] AP Buffer object storing matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChpr() function otherwise. 
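 *
 * A minimal call sketch (illustrative, not from the original reference):
 * note that \c alpha is a real (cl_double) scalar even though the matrix
 * elements are double-complex; \c queue, \c bufX and \c bufAP are assumed
 * to exist and are placeholder names.
 * \code
 * size_t N = 5;
 * cl_event event = NULL;
 * clblasStatus err = clblasZhpr(clblasColumnMajor, clblasUpper, N,
 *                               1.0,             /* alpha (real)   */
 *                               bufX, 0, 1,      /* X, offx, incx  */
 *                               bufAP, 0,        /* AP, offa       */
 *                               1, &queue, 0, NULL, &event);
 * if (err == clblasSuccess)
 *     clWaitForEvents(1, &event); /* AP += alpha * x * x^H (packed) */
 * \endcode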
* * @ingroup HPR */ clblasStatus clblasZhpr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup SPR2 SPR2 - Symmetric packed matrix rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * symmetric rank 2 update operations on packed matrices * @ingroup BLAS2 */ /*@{*/ /** * @brief Symmetric rank 2 operation with a general triangular packed-matrix and * float elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero * - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SPR2 */ clblasStatus clblasSspr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_sspr2.c * Example of how to use the @ref clblasSspr2 function. */ /** * @brief Symmetric rank 2 operation with a general triangular packed-matrix and * double elements. * * Symmetric rank 2 operation: * - \f$ A \leftarrow \alpha X Y^T + \alpha Y X^T + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. 
* @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset of first element of matrix \b AP in buffer object. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSspr2() function otherwise. * * @ingroup SPR2 */ clblasStatus clblasDspr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /*@}*/ /** * @defgroup HPR2 HPR2 - Hermitian packed matrix rank 2 update * * The Level 2 Basic Linear Algebra Subprograms are functions that perform * hermitian rank 2 update operations on packed matrices * @ingroup BLAS2 */ /*@{*/ /** * @brief Hermitian rank 2 operation with a general triangular packed-matrix and * float-compelx elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \conjg( alpha ) Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
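 *
 * Before the return codes below, a hedged host-side sketch of the call.
 * Buffer creation, clblasSetup(), and error handling are assumed to be done
 * by the caller; names such as \b queue, \b bufX, \b bufY and \b bufAP are
 * illustrative only.
 * @code
 * #include <clBLAS.h>
 *
 * // Sketch: AP <- alpha * X * Y^H + conj(alpha) * Y * X^H + AP
 * // for an N x N packed Hermitian matrix (lower triangle).
 * static clblasStatus chpr2_sketch(cl_command_queue queue, size_t N,
 *                                  cl_mem bufX, cl_mem bufY, cl_mem bufAP)
 * {
 *     cl_float2 alpha = {{1.0f, 0.0f}};   // purely real scaling factor
 *     cl_event done;
 *
 *     clblasStatus status = clblasChpr2(clblasColumnMajor, clblasLower, N, alpha,
 *                                       bufX, 0, 1,     // X, offx, incx
 *                                       bufY, 0, 1,     // Y, offy, incy
 *                                       bufAP, 0,       // AP, offa
 *                                       1, &queue, 0, NULL, &done);
 *     if (status == clblasSuccess)
 *         clWaitForEvents(1, &done);      // block until the update has finished
 *     return status;
 * }
 * @endcode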
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N is zero, or * - either \b incx or \b incy is zero * - \b clblasInvalidMemObject if either \b AP, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HPR2 */ clblasStatus clblasChpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @brief Hermitian rank 2 operation with a general triangular packed-matrix and * double-compelx elements. * * Hermitian rank 2 operation: * - \f$ A \leftarrow \alpha X Y^H + \conjg( alpha ) Y X^H + A \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of columns in matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for the first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] Y Buffer object storing vector \b Y. * @param[in] offy Offset in number of elements for the first element in vector \b Y. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[out] AP Buffer object storing packed-matrix \b AP. * @param[in] offa Offset in number of elements for the first element in matrix \b AP. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChpr2() function otherwise. * * @ingroup HPR2 */ clblasStatus clblasZhpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events); /** * @example example_zhpr2.c * Example of how to use the @ref clblasZhpr2 function. */ /*@}*/ /** * @defgroup GBMV GBMV - General banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a general rectangular banded matrix and * float elements. 
* * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b M or \b N is zero, or * - KL is greater than \b M - 1, or * - KU is greater than \b N - 1, or * - either \b incx or \b incy is zero, or * - any of the leading dimensions is invalid; * - the matrix size or the vector sizes along with the increments lead to * accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b A, \b X, or \b Y object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup GBMV */ clblasStatus clblasSgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sgbmv.c * Example of how to use the @ref clblasSgbmv function. */ /** * @brief Matrix-vector product with a general rectangular banded matrix and * double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. 
* @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSgbmv() function otherwise. * * @ingroup GBMV */ clblasStatus clblasDgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a general rectangular banded matrix and * float-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. 
* @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasSgbmv() function. * * @ingroup GBMV */ clblasStatus clblasCgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a general rectangular banded matrix and * double-complex elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * - \f$ Y \leftarrow \alpha A^T X + \beta Y \f$ * * @param[in] order Row/column order. * @param[in] trans How matrix \b A is to be transposed. * @param[in] M Number of rows in banded matrix \b A. * @param[in] N Number of columns in banded matrix \b A. * @param[in] KL Number of sub-diagonals in banded matrix \b A. * @param[in] KU Number of super-diagonals in banded matrix \b A. * @param[in] alpha The factor of banded matrix \b A. * @param[in] A Buffer object storing banded matrix \b A. * @param[in] offa Offset in number of elements for the first element in banded matrix \b A. * @param[in] lda Leading dimension of banded matrix \b A. It cannot be less * than ( \b KL + \b KU + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] beta The factor of the vector \b Y. * @param[out] Y Buffer object storing the vector \b y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of \b Y. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDgbmv() function. * * @ingroup GBMV */ clblasStatus clblasZgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TBMV TBMV - Triangular banded matrix vector multiply * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a triangular banded matrix and * float elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. 
* @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TBMV */ clblasStatus clblasStbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_stbmv.c * Example of how to use the @ref clblasStbmv function. */ /** * @brief Matrix-vector product with a triangular banded matrix and * double elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. 
* @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasStbmv() function otherwise. * * @ingroup TBMV */ clblasStatus clblasDtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a triangular banded matrix and * float-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasStbmv() function. * * @ingroup TBMV */ clblasStatus clblasCtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-vector product with a triangular banded matrix and * double-complex elements. * * Matrix-vector products: * - \f$ X \leftarrow A X \f$ * - \f$ X \leftarrow A^T X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. 
* @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] scratchBuff Temporary cl_mem scratch buffer object which can hold a * minimum of (1 + (N-1)*abs(incx)) elements * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDtbmv() function. * * @ingroup TBMV */ clblasStatus clblasZtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SBMV SBMV - Symmetric banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a symmetric banded matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SBMV */ clblasStatus clblasSsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_ssbmv.c * This is an example of how to use the @ref clblasSsbmv function. */ /** * @brief Matrix-vector product with a symmetric banded matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSsbmv() function otherwise. 
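 *
 * A brief host-side sketch (illustrative only, not one of the shipped
 * example_*.c files) of a double-precision symmetric banded multiply; it
 * assumes the caller already owns a queue and buffers laid out in banded
 * storage with the minimum leading dimension lda = K + 1.
 * @code
 * #include <clBLAS.h>
 *
 * // Sketch: Y <- alpha * A * X + beta * Y for a symmetric banded N x N matrix
 * // with K super-diagonals stored in bufA (banded layout, lda = K + 1).
 * static clblasStatus dsbmv_sketch(cl_command_queue queue, size_t N, size_t K,
 *                                  cl_mem bufA, cl_mem bufX, cl_mem bufY)
 * {
 *     const size_t lda = K + 1;            // minimum allowed leading dimension
 *     cl_event done;
 *
 *     clblasStatus status = clblasDsbmv(clblasColumnMajor, clblasUpper, N, K,
 *                                       2.0, bufA, 0, lda,
 *                                       bufX, 0, 1,
 *                                       0.0, bufY, 0, 1,
 *                                       1, &queue, 0, NULL, &done);
 *     if (status == clblasSuccess)
 *         clWaitForEvents(1, &done);
 *     return status;
 * }
 * @endcode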
* * @ingroup SBMV */ clblasStatus clblasDsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HBMV HBMV - Hermitian banded matrix-vector multiplication * @ingroup BLAS2 */ /*@{*/ /** * @brief Matrix-vector product with a hermitian banded matrix and float elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HBMV */ clblasStatus clblasChbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_chbmv.c * This is an example of how to use the @ref clblasChbmv function. 
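 *
 * A hedged inline sketch of the same call is given here for convenience; the
 * shipped example_chbmv.c remains the authoritative reference. The queue and
 * buffers are assumed to have been created by the caller.
 * @code
 * #include <clBLAS.h>
 *
 * // Sketch: Y <- alpha * A * X + beta * Y for a Hermitian banded N x N matrix
 * // with K super-diagonals stored in bufA (lda = K + 1).
 * static clblasStatus chbmv_sketch(cl_command_queue queue, size_t N, size_t K,
 *                                  cl_mem bufA, cl_mem bufX, cl_mem bufY)
 * {
 *     cl_float2 alpha = {{1.0f, 0.0f}};
 *     cl_float2 beta  = {{0.0f, 0.0f}};
 *     cl_event done;
 *
 *     clblasStatus status = clblasChbmv(clblasColumnMajor, clblasUpper, N, K,
 *                                       alpha, bufA, 0, K + 1,
 *                                       bufX, 0, 1,
 *                                       beta, bufY, 0, 1,
 *                                       1, &queue, 0, NULL, &done);
 *     if (status == clblasSuccess)
 *         clWaitForEvents(1, &done);
 *     return status;
 * }
 * @endcode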
*/ /** * @brief Matrix-vector product with a hermitian banded matrix and double elements. * * Matrix-vector products: * - \f$ Y \leftarrow \alpha A X + \beta Y \f$ * * @param[in] order Row/columns order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] N Number of rows and columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in banded matrix \b A. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[in] X Buffer object storing vector \b X. * @param[in] offx Offset of first element of vector \b X in buffer object. * Counted in elements. * @param[in] incx Increment for the elements of vector \b X. It cannot be zero. * @param[in] beta The factor of vector \b Y. * @param[out] Y Buffer object storing vector \b Y. * @param[in] offy Offset of first element of vector \b Y in buffer object. * Counted in elements. * @param[in] incy Increment for the elements of vector \b Y. It cannot be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChbmv() function otherwise. * * @ingroup HBMV */ clblasStatus clblasZhbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TBSV TBSV - Solving triangular banded matrix * @ingroup BLAS2 */ /*@{*/ /** * @brief solving triangular banded matrix problems with float elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b incx is zero, or * - K is greater than \b N - 1 * - the leading dimension is invalid; * - \b clblasInvalidMemObject if either \b A or \b X object is * Invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs * to was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup TBSV */ clblasStatus clblasStbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_stbsv.c * This is an example of how to use the @ref clblasStbsv function. */ /** * @brief solving triangular banded matrix problems with double elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasStbsv() function otherwise. * * @ingroup TBSV */ clblasStatus clblasDtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular banded matrix problems with float-complex elements. 
* * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasStbsv() function. * * @ingroup TBSV */ clblasStatus clblasCtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief solving triangular banded matrix problems with double-complex elements. * * Matrix-vector products: * - \f$ A X \leftarrow X \f$ * - \f$ A^T X \leftarrow X \f$ * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix \b A is unit triangular. * @param[in] N Number of rows/columns in banded matrix \b A. * @param[in] K Number of sub-diagonals/super-diagonals in triangular banded matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset in number of elements for first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than ( \b K + 1 ) * @param[out] X Buffer object storing vector \b X. * @param[in] offx Offset in number of elements for first element in vector \b X. * @param[in] incx Increment for the elements of \b X. Must not be zero. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDtbsv() function. 
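 *
 * For illustration, a hedged host-side sketch of the single-precision variant
 * clblasStbsv. It solves the banded system in place, overwriting the
 * right-hand side stored in \b bufX; context/queue creation and error
 * handling are assumed to be done elsewhere.
 * @code
 * #include <clBLAS.h>
 *
 * // Sketch: solve the triangular banded system A x = b, where b is loaded
 * // into bufX on entry and x is returned in bufX on completion.
 * static clblasStatus stbsv_sketch(cl_command_queue queue, size_t N, size_t K,
 *                                  cl_mem bufA, cl_mem bufX)
 * {
 *     const size_t lda = K + 1;   // minimum leading dimension for banded storage
 *     cl_event done;
 *
 *     clblasStatus status = clblasStbsv(clblasColumnMajor, clblasUpper,
 *                                       clblasNoTrans, clblasNonUnit,
 *                                       N, K, bufA, 0, lda,
 *                                       bufX, 0, 1,
 *                                       1, &queue, 0, NULL, &done);
 *     if (status == clblasSuccess)
 *         clWaitForEvents(1, &done);
 *     return status;
 * }
 * @endcode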
* * @ingroup TBSV */ clblasStatus clblasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup BLAS3 BLAS-3 functions * * The Level 3 Basic Linear Algebra Subprograms are funcions that perform * matrix-matrix operations. */ /*@{*/ /*@}*/ /** * @defgroup GEMM GEMM - General matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of general rectangular matrices with float * elements. Extended version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b K when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when the * parameter is set to \b clblasColumnMajor. * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b K * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as clblasSgemm() otherwise. 
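 *
 * A condensed host-side sketch of a single-precision GEMM call (distinct from
 * the shipped example_sgemm.c). Matrix contents, context/queue creation and
 * error checking are left to the caller; row-major storage is assumed, so
 * lda = K, ldb = N and ldc = N.
 * @code
 * #include <clBLAS.h>
 *
 * // Sketch: C <- alpha * A * B + beta * C for row-major MxK, KxN and MxN matrices.
 * static clblasStatus sgemm_sketch(cl_context ctx, cl_command_queue queue,
 *                                  size_t M, size_t N, size_t K,
 *                                  const cl_float *hostA, const cl_float *hostB,
 *                                  cl_float *hostC)
 * {
 *     cl_int err;
 *     cl_event done;
 *     cl_mem A = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                               M * K * sizeof(cl_float), (void *)hostA, &err);
 *     cl_mem B = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                               K * N * sizeof(cl_float), (void *)hostB, &err);
 *     cl_mem C = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
 *                               M * N * sizeof(cl_float), hostC, &err);
 *
 *     clblasStatus status = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
 *                                       M, N, K,
 *                                       1.0f, A, 0, K,
 *                                       B, 0, N,
 *                                       0.0f, C, 0, N,
 *                                       1, &queue, 0, NULL, &done);
 *     if (status == clblasSuccess)
 *         clEnqueueReadBuffer(queue, C, CL_TRUE, 0, M * N * sizeof(cl_float),
 *                             hostC, 1, &done, NULL);
 *     clReleaseMemObject(C);
 *     clReleaseMemObject(B);
 *     clReleaseMemObject(A);
 *     return status;
 * }
 * @endcode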
* * @ingroup GEMM */ clblasStatus clblasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_sgemm.c * This is an example of how to use the @ref clblasSgemmEx function. */ /** * @brief Matrix-matrix product of general rectangular matrices with double * elements. Extended version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clblasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clblasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clblasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clblasSgemm() function otherwise. * * @ingroup GEMM */ clblasStatus clblasDgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-matrix product of general rectangular matrices with float * complex elements. Extended version. 
* * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clblasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clblasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clblasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clblasSgemm() function otherwise. * * @ingroup GEMM */ clblasStatus clblasCgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-matrix product of general rectangular matrices with double * complex elements. Exteneded version. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \beta C \f$ * - \f$ C \leftarrow \alpha A B^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B^T + \beta C \f$ * * @param[in] order Row/column order. * @param[in] transA How matrix \b A is to be transposed. * @param[in] transB How matrix \b B is to be transposed. * @param[in] M Number of rows in matrix \b A. * @param[in] N Number of columns in matrix \b B. * @param[in] K Number of columns in matrix \b A and rows in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed description, * see clblasSgemm(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. 
Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed description, * see clblasSgemm(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. For detailed description, * see clblasSgemm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA, \b offB or offC exceeds * the size of the respective buffer object; * - the same error codes as the clblasSgemm() function otherwise. * * @ingroup GEMM */ clblasStatus clblasZgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TRMM TRMM - Triangular matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Multiplying a matrix by a triangular matrix with float elements. * Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when it is set * to \b clblasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or not less than \b M * when it is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
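 *
 * As an illustration only (this sketch is not one of the library's shipped samples),
 * a single-queue call computing \f$ B \leftarrow \alpha A B \f$ could look as follows.
 * Here \c bufA and \c bufB are assumed to be valid \c cl_mem buffers already holding
 * column-major data, \c queue a valid command queue on a context for which clblasSetup()
 * has been called, and error handling is omitted:
 * @code
 * const size_t M = 256, N = 128;          // illustrative sizes; buffers sized to match
 * cl_event event = NULL;
 * clblasStatus status = clblasStrmm(clblasColumnMajor, clblasLeft, clblasUpper,
 *                                   clblasNoTrans, clblasNonUnit,
 *                                   M, N, 1.0f,
 *                                   bufA, 0, M,    // lda: at least M for clblasLeft
 *                                   bufB, 0, M,    // ldb: at least M for clblasColumnMajor
 *                                   1, &queue, 0, NULL, &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);         // B now holds alpha * A * B
 * @endcode
 *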
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clblasStrmm() otherwise. * * @ingroup TRMM */ clblasStatus clblasStrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_strmm.c * This is an example of how to use the @ref clblasStrmmEx function. */ /** * @brief Multiplying a matrix by a triangular matrix with double elements. * Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clblasStrmm() function otherwise. * * @ingroup TRMM */ clblasStatus clblasDtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Multiplying a matrix by a triangular matrix with float complex * elements. Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. 
* @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] A Buffer object storing matrix \b A. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clblasStrmm() otherwise. * * @ingroup TRMM */ clblasStatus clblasCtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Multiplying a matrix by a triangular matrix with double complex * elements. Extended version. * * Matrix-triangular matrix products: * - \f$ B \leftarrow \alpha A B \f$ * - \f$ B \leftarrow \alpha A^T B \f$ * - \f$ B \leftarrow \alpha B A \f$ * - \f$ B \leftarrow \alpha B A^T \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrmm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrmm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clblasStrmm() function otherwise. * * @ingroup TRMM */ clblasStatus clblasZtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup TRSM TRSM - Solving triangular systems of equations * @ingroup BLAS3 */ /*@{*/ /** * @brief Solving triangular systems of equations with multiple right-hand * sides and float elements. Extended version. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N * when it is set to \b clblasRight. * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clblasStrsm() otherwise. * * @ingroup TRSM */ clblasStatus clblasStrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_strsm.c * This is an example of how to use the @ref clblasStrsmEx function. */ /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double elements. Extended version. 
* * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b A is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clblasStrsm() function otherwise. * * @ingroup TRSM */ clblasStatus clblasDtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Solving triangular systems of equations with multiple right-hand * sides and float complex elements. Extended version. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b A is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements.
* @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as clblasStrsm() otherwise. * * @ingroup TRSM */ clblasStatus clblasCtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Solving triangular systems of equations with multiple right-hand * sides and double complex elements. Extended version. * * Solving triangular systems of equations: * - \f$ B \leftarrow \alpha A^{-1} B \f$ * - \f$ B \leftarrow \alpha A^{-T} B \f$ * - \f$ B \leftarrow \alpha B A^{-1} \f$ * - \f$ B \leftarrow \alpha B A^{-T} \f$ * * where \b T is an upper or lower triangular matrix. * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] diag Specify whether matrix is unit triangular. * @param[in] M Number of rows in matrix \b B. * @param[in] N Number of columns in matrix \b B. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasStrsm(). * @param[out] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasStrsm(). * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
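 *
 * As an illustration only (not a shipped sample), a single-queue call solving
 * \f$ A X = \alpha B \f$ for \f$ X \f$, overwriting \b B with the solution, could look
 * as follows; \c bufA and \c bufB are assumed to be valid buffers of DoubleComplex
 * elements in column-major layout, \c queue a valid command queue after clblasSetup(),
 * and error handling is omitted:
 * @code
 * const size_t M = 256, N = 64;           // illustrative sizes
 * DoubleComplex alpha;
 * alpha.s[0] = 1.0;                       // real part
 * alpha.s[1] = 0.0;                       // imaginary part
 * cl_event event = NULL;
 * clblasStatus status = clblasZtrsm(clblasColumnMajor, clblasLeft, clblasLower,
 *                                   clblasNoTrans, clblasNonUnit,
 *                                   M, N, alpha,
 *                                   bufA, 0, M,   // lda: at least M for clblasLeft
 *                                   bufB, 0, M,   // ldb: at least M for clblasColumnMajor
 *                                   1, &queue, 0, NULL, &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);         // B now holds the solution X
 * @endcode
 *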
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offB exceeds the size * of the respective buffer object; * - the same error codes as the clblasStrsm() function otherwise * * @ingroup TRSM */ clblasStatus clblasZtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SYRK SYRK - Symmetric rank-k update of a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-k update of a symmetric matrix with float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - the same error codes as the clblasSsyrk() function otherwise. * * @ingroup SYRK */ clblasStatus clblasSsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_ssyrk.c * This is an example of how to use the @ref clblasSsyrkEx function. */ /** * @brief Rank-k update of a symmetric matrix with double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. 
* @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - the same error codes as the clblasSsyrk() function otherwise. * * @ingroup SYRK */ clblasStatus clblasDsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Rank-k update of a symmetric matrix with complex float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
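 *
 * As an illustration only (not a shipped sample), a single-queue rank-k update
 * \f$ C \leftarrow \alpha A A^T + \beta C \f$ could look as follows; note that, per the
 * error codes below, \b transA may not be \ref clblasConjTrans for the complex variants.
 * \c bufA and \c bufC are assumed to be valid buffers of FloatComplex elements in
 * column-major layout and \c queue a valid command queue after clblasSetup():
 * @code
 * const size_t N = 512, K = 64;           // illustrative sizes
 * FloatComplex alpha, beta;
 * alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;
 * beta.s[0]  = 0.0f; beta.s[1]  = 0.0f;
 * cl_event event = NULL;
 * clblasStatus status = clblasCsyrk(clblasColumnMajor, clblasUpper, clblasNoTrans,
 *                                   N, K, alpha,
 *                                   bufA, 0, N,   // column-major: lda at least N
 *                                   beta,
 *                                   bufC, 0, N,   // ldc at least N
 *                                   1, &queue, 0, NULL, &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode
 *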
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - \b clblasInvalidValue if \b transA is set to \ref clblasConjTrans. * - the same error codes as the clblasSsyrk() function otherwise. * * @ingroup SYRK */ clblasStatus clblasCsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Rank-k update of a symmetric matrix with complex double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyrk(). * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA or \b offC exceeds the size * of the respective buffer object; * - \b clblasInvalidValue if \b transA is set to \ref clblasConjTrans. * - the same error codes as the clblasSsyrk() function otherwise. * * @ingroup SYRK */ clblasStatus clblasZsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SYR2K SYR2K - Symmetric rank-2k update to a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-2k update of a symmetric matrix with float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. 
* @param[in] transAB How matrices \b A and \b B are to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b K if \b B matches the op(\b B) matrix * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as the clblasSsyr2k() function otherwise. * * @ingroup SYR2K */ clblasStatus clblasSsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_ssyr2k.c * This is an example of how to use the @ref clblasSsyr2k function. */ /** * @brief Rank-2k update of a symmetric matrix with double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A + \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B are to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B.
For detailed * description, see clblasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - the same error codes as the clblasSsyr2k() function otherwise. * * @ingroup SYR2K */ clblasStatus clblasDsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Rank-2k update of a symmetric matrix with complex float elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
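 *
 * As an illustration only (not a shipped sample), a single-queue rank-2k update
 * \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ could look as follows;
 * as with the complex SYRK variants, \b transAB may not be \ref clblasConjTrans.
 * \c bufA, \c bufB and \c bufC are assumed to be valid buffers of FloatComplex elements
 * in column-major layout and \c queue a valid command queue after clblasSetup():
 * @code
 * const size_t N = 512, K = 64;           // illustrative sizes
 * FloatComplex alpha, beta;
 * alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;
 * beta.s[0]  = 1.0f; beta.s[1]  = 0.0f;
 * cl_event event = NULL;
 * clblasStatus status = clblasCsyr2k(clblasColumnMajor, clblasUpper, clblasNoTrans,
 *                                    N, K, alpha,
 *                                    bufA, 0, N,   // lda at least N in column-major
 *                                    bufB, 0, N,   // ldb at least N in column-major
 *                                    beta,
 *                                    bufC, 0, N,   // ldc at least N
 *                                    1, &queue, 0, NULL, &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode
 *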
* * @return * - \b clblasSuccess on success; * - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - \b clblasInvalidValue if \b transAB is set to \ref clblasConjTrans. * - the same error codes as the clblasSsyr2k() function otherwise. * * @ingroup SYR2K */ clblasStatus clblasCsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Rank-2k update of a symmetric matrix with complex double elements. * Extended version. * * Rank-k updates: * - \f$ C \leftarrow \alpha A B^T + \alpha B A^T + \beta C \f$ * - \f$ C \leftarrow \alpha A^T B + \alpha B^T A \beta C \f$ * * where \b C is a symmetric matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transAB How matrices \b A and \b B is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrices \b A and \b B if they * are not transposed, and number of rows otherwise. * @param[in] alpha The factor of matrices \b A and \b B. * @param[in] A Buffer object storing matrix \b A. * @param[in] offA Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. For detailed * description, see clblasSsyr2k(). * @param[in] B Buffer object storing matrix \b B. * @param[in] offB Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. For detailed * description, see clblasSsyr2k(). * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offC Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - \b clblasInvalidValue if either \b offA, \b offB or \b offC exceeds * the size of the respective buffer object; * - \b clblasInvalidValue if \b transAB is set to \ref clblasConjTrans. * - the same error codes as the clblasSsyr2k() function otherwise. 
* * @ingroup SYR2K */ clblasStatus clblasZsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup SYMM SYMM - Symmetric matrix-matrix multiply * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of symmetric rectangular matrices with float * elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
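 *
 * As an illustration only (not a shipped sample; see also example_ssymm.c referenced
 * below), a single-queue call computing \f$ C \leftarrow \alpha A B + \beta C \f$ with
 * a symmetric \b A on the left could look as follows; \c bufA, \c bufB and \c bufC are
 * assumed to be valid buffers with column-major data and \c queue a valid command queue
 * after clblasSetup():
 * @code
 * const size_t M = 256, N = 128;          // illustrative sizes
 * cl_event event = NULL;
 * clblasStatus status = clblasSsymm(clblasColumnMajor, clblasLeft, clblasUpper,
 *                                   M, N, 1.0f,
 *                                   bufA, 0, M,   // lda: at least M for clblasLeft
 *                                   bufB, 0, M,   // ldb: at least M for clblasColumnMajor
 *                                   0.0f,
 *                                   bufC, 0, M,   // ldc: at least M for clblasColumnMajor
 *                                   1, &queue, 0, NULL, &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode
 *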
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b M or \b N is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clblasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clblasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup SYMM */ clblasStatus clblasSsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_ssymm.c * This is an example of how to use the @ref clblasSsymm function. */ /** * @brief Matrix-matrix product of symmetric rectangular matrices with double * elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. 
* @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasSsymm() function otherwise. * * @ingroup SYMM */ clblasStatus clblasDsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-matrix product of symmetric rectangular matrices with * float-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasSsymm() function. * * @ingroup SYMM */ clblasStatus clblasCsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Matrix-matrix product of symmetric rectangular matrices with * double-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. 
* @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return The same result as the clblasDsymm() function. * * @ingroup SYMM */ clblasStatus clblasZsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HEMM HEMM - Hermitian matrix-matrix multiplication * @ingroup BLAS3 */ /*@{*/ /** * @brief Matrix-matrix product of hermitian rectangular matrices with * float-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. * @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. 
It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - \b M or \b N is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clblasInvalidMemObject if A, B, or C object is invalid, * or an image object rather than the buffer one; * - \b clblasOutOfResources if you use image-based function implementation * and no suitable scratch image available; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released; * - \b clblasInvalidOperation if kernel compilation relating to a previous * call has not completed for any of the target devices; * - \b clblasCompilerNotAvailable if a compiler is not available; * - \b clblasBuildProgramFailure if there is a failure to build a program * executable. * * @ingroup HEMM */ clblasStatus clblasChemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_chemm.cpp * This is an example of how to use the @ref clblasChemm function. */ /** * @brief Matrix-matrix product of hermitian rectangular matrices with * double-complex elements. * * Matrix-matrix products: * - \f$ C \leftarrow \alpha A B + \beta C \f$ * - \f$ C \leftarrow \alpha B A + \beta C \f$ * * @param[in] order Row/column order. * @param[in] side The side of triangular matrix. * @param[in] uplo The triangle in matrix being referenced. * @param[in] M Number of rows in matrices \b B and \b C. * @param[in] N Number of columns in matrices \b B and \b C. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing matrix \b A. * @param[in] offa Offset of the first element of the matrix \b A in the * buffer object. Counted in elements. * @param[in] lda Leading dimension of matrix \b A. It cannot be less * than \b M when the \b side parameter is set to * \b clblasLeft,\n or less than \b N when the * parameter is set to \b clblasRight. * @param[in] B Buffer object storing matrix \b B. 
* @param[in] offb Offset of the first element of the matrix \b B in the * buffer object. Counted in elements. * @param[in] ldb Leading dimension of matrix \b B. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M * when it is set to \b clblasColumnMajor. * @param[in] beta The factor of matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset of the first element of the matrix \b C in the * buffer object. Counted in elements. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b M when * it is set to \b clblasColumnMajorOrder. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasChemm() function otherwise. * * @ingroup HEMM */ clblasStatus clblasZhemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HERK HERK - Hermitian rank-k update to a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-k update of a hermitian matrix with float-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. 
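 *
 * As a rough usage sketch (not one of the library's shipped examples; the
 * context, queue, sizes N/K/lda/ldc and the buffers bufA/bufC below are
 * hypothetical and assumed to be set up beforehand):
 * @code
 * cl_event event = NULL;
 * clblasStatus status = clblasCherk(
 *     clblasColumnMajor, clblasUpper, clblasNoTrans,
 *     N, K,
 *     1.0f,            // alpha
 *     bufA, 0, lda,    // A, offa, lda
 *     0.0f,            // beta
 *     bufC, 0, ldc,    // C, offc, ldc
 *     1, &queue,       // a single command queue
 *     0, NULL,         // no events to wait for
 *     &event);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &event);
 * @endcode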
* * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outsize of any of the buffers; * - \b clblasInvalidMemObject if either \b A or \b C object is * invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if a context a passed command queue belongs to * was released. * * @ingroup HERK */ clblasStatus clblasCherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offa, size_t lda, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_cherk.cpp * This is an example of how to use the @ref clblasCherk function. */ /** * @brief Rank-k update of a hermitian matrix with double-complex elements. * * Rank-k updates: * - \f$ C \leftarrow \alpha A A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] transA How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matric \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCherk() function otherwise. * * @ingroup HERK */ clblasStatus clblasZherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offa, size_t lda, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @defgroup HER2K HER2K - Hermitian rank-2k update to a matrix * @ingroup BLAS3 */ /*@{*/ /** * @brief Rank-2k update of a hermitian matrix with float-complex elements. 
* * Rank-2k updates: * - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] trans How matrix \b A is to be transposed. * @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] B Buffer object storing the matrix \b B. * @param[in] offb Offset in number of elements for the first element in matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be * less than \b K if \b B is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasNotInitialized if clblasSetup() was not called; * - \b clblasInvalidValue if invalid parameters are passed: * - either \b N or \b K is zero, or * - any of the leading dimensions is invalid; * - the matrix sizes lead to accessing outside of any of the buffers; * - \b clblasInvalidMemObject if either \b A, \b B or \b C object is * invalid, or an image object rather than the buffer one; * - \b clblasOutOfHostMemory if the library can't allocate memory for * internal structures; * - \b clblasInvalidCommandQueue if the passed command queue is invalid; * - \b clblasInvalidContext if the context a passed command queue belongs to * was released. * * @ingroup HER2K */ clblasStatus clblasCher2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @example example_cher2k.c * This is an example of how to use the @ref clblasCher2k function. */ /** * @brief Rank-2k update of a hermitian matrix with double-complex elements. * * Rank-2k updates: * - \f$ C \leftarrow \alpha A B^H + conj( \alpha ) B A^H + \beta C \f$ * - \f$ C \leftarrow \alpha A^H B + conj( \alpha ) B^H A + \beta C \f$ * * where \b C is a hermitian matrix. * * @param[in] order Row/column order. * @param[in] uplo The triangle in matrix \b C being referenced. * @param[in] trans How matrix \b A is to be transposed. 
* @param[in] N Number of rows and columns in matrix \b C. * @param[in] K Number of columns of the matrix \b A if it is not * transposed, and number of rows otherwise. * @param[in] alpha The factor of matrix \b A. * @param[in] A Buffer object storing the matrix \b A. * @param[in] offa Offset in number of elements for the first element in matrix \b A. * @param[in] lda Leading dimension of matrix \b A. It cannot be * less than \b K if \b A is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] B Buffer object storing the matrix \b B. * @param[in] offb Offset in number of elements for the first element in matrix \b B. * @param[in] ldb Leading dimension of matrix \b B. It cannot be * less than \b K if \b B is * in the row-major format, and less than \b N * otherwise. Vice-versa for transpose case. * @param[in] beta The factor of the matrix \b C. * @param[out] C Buffer object storing matrix \b C. * @param[in] offc Offset in number of elements for the first element in matrix \b C. * @param[in] ldc Leading dimension of matrix \b C. It cannot be less * than \b N. * @param[in] numCommandQueues Number of OpenCL command queues in which the * task is to be performed. * @param[in] commandQueues OpenCL command queues. * @param[in] numEventsInWaitList Number of events in the event wait list. * @param[in] eventWaitList Event wait list. * @param[in] events Event objects per each command queue that identify * a particular kernel execution instance. * * @return * - \b clblasSuccess on success; * - \b clblasInvalidDevice if a target device does not support floating * point arithmetic with double precision; * - the same error codes as the clblasCher2k() function otherwise. * * @ingroup HER2K */ clblasStatus clblasZher2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /*@}*/ /** * @brief Helper function to compute leading dimension and size of a matrix * * @param[in] order matrix ordering * @param[in] rows number of rows * @param[in] columns number of columns * @param[in] elemsize element size * @param[in] padding additional padding on the leading dimension * @param[out] ld if non-NULL *ld is filled with the leading dimension * in elements * @param[out] fullsize if non-NULL *fullsize is filled with the byte size * * @return * - \b clblasSuccess for success * - \b clblasInvalidValue if: * - \b elemsize is 0 * - \b rows and \b columns are both equal to 0 */ clblasStatus clblasMatrixSizeInfo( clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t padding, size_t * ld, size_t * fullsize); /** * @brief Allocates matrix on device and computes ld and size * * @param[in] context OpenCL context * @param[in] order Row/column order. 
* @param[in] rows number of rows * @param[in] columns number of columns * @param[in] elemsize element size * @param[in] padding additional padding on the leading dimension * @param[out] ld if non-NULL *ld is filled with the leading dimension * in elements * @param[out] fullsize if non-NULL *fullsize is filled with the byte size * @param[in] err Error code (see \b clCreateBuffer() ) * * @return * - OpenCL memory object of the allocated matrix */ cl_mem clblasCreateMatrix( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t padding, size_t * ld, size_t * fullsize, cl_int * err); /** * @brief Allocates matrix on device with specified size and ld and computes its size * * @param[in] context OpenCL context * @param[in] order Row/column order. * @param[in] rows number of rows * @param[in] columns number of columns * @param[in] elemsize element size * @param[in] padding additional padding on the leading dimension * @param[out] ld the length of the leading dimensions. It cannot * be less than \b columns when the \b order parameter is set to * \b clblasRowMajor,\n or less than \b rows when the * parameter is set to \b clblasColumnMajor. * @param[out] fullsize if non-NULL *fullsize is filled with the byte size * @param[in] err Error code (see \b clCreateBuffer() ) * * @return * - OpenCL memory object of the allocated matrix */ cl_mem clblasCreateMatrixWithLd( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t ld, size_t * fullsize, cl_int * err) ; /** * @brief Allocates matrix on device and initialize from existing similar matrix * on host. See \b clblasCreateMatrixBuffer(). * * @param[in] ld leading dimension in elements * @param[in] host base address of host matrix data * @param[in] off_host host matrix offset in elements * @param[in] ld_host leading dimension of host matrix in elements * @param[in] command_queue specifies the OpenCL queue * @param[in] numEventsInWaitList specifies the number of OpenCL events * to wait for * @param[in] eventWaitList specifies the list of OpenCL events to * wait for * * @return * - OpenCL memory object of the allocated matrix */ cl_mem clblasCreateMatrixFromHost( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t ld, void * host, size_t off_host, size_t ld_host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_int * err); /** * @brief Copies synchronously a sub-matrix from host (A) to device (B). 
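 *
 * The call copies an \c nx by \c ny tile whose top-left corner is at
 * position (\c xA, \c yA) of the host matrix \b A into position
 * (\c xB, \c yB) of the device matrix \b B. As a hedged sketch with
 * hypothetical dimensions (a 10x10 row-major float host matrix \c hostA,
 * an 8x8 device matrix \c devB, copying a 4x4 tile from (2,2) to (0,0)):
 * @code
 * clblasStatus status = clblasWriteSubMatrix(
 *     clblasRowMajor, sizeof(cl_float),
 *     hostA, 0, 10, 10, 10, 2, 2,   // A, offA, ldA, nrA, ncA, xA, yA
 *     devB,  0,  8,  8,  8, 0, 0,   // B, offB, ldB, nrB, ncB, xB, yB
 *     4, 4,                         // nx, ny
 *     queue, 0, NULL);
 * @endcode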
* * @param[in] order matrix ordering * @param[in] element_size element size * @param[in] A specifies the source matrix on the host * @param[in] offA specifies the offset of matrix A in * elements * @param[in] ldA specifies the leading dimension of * matrix A in elements * @param[in] nrA specifies the number of rows of A * in elements * @param[in] ncA specifies the number of columns of A * in elements * @param[in] xA specifies the top-left x position to * copy from A * @param[in] yA specifies the top-left y position to * copy from A * @param[in] B specifies the destination matrix on the * device * @param[in] offB specifies the offset of matrix B in * elements * @param[in] ldB specifies the leading dimension of * matrix B in bytes * @param[in] nrB specifies the number of rows of B * in elements * @param[in] ncB specifies the number of columns of B * in elements * @param[in] xB specifies the top-left x position to * copy from B * @param[in] yB specifies the top-left y position to * copy from B * @param[in] nx specifies the number of elements to * copy according to the x dimension (rows) * @param[in] ny specifies the number of elements to * copy according to the y dimension * (columns) * @param[in] command_queue specifies the OpenCL queue * @param[in] numEventsInWaitList specifies the number of OpenCL events * to wait for * @param[in] eventWaitList specifies the list of OpenCL events to * wait for * * @return * - \b clblasSuccess for success * - \b clblasInvalidValue if: * - \b xA + \b offA + \b nx is superior to number of columns of A * - \b xB + \b offB + \b nx is superior to number of columns of B * - \b yA + \b ny is superior to number of rows of A * - \b yB + \b ny is superior to number of rows of B */ clblasStatus clblasWriteSubMatrix( clblasOrder order, size_t element_size, const void *A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a sub-matrix from host (A) to device (B). * See \b clblasWriteSubMatrix(). * * @param[out] event Event objects per each command queue that identify a * particular kernel execution instance. */ clblasStatus clblasWriteSubMatrixAsync( clblasOrder order, size_t element_size, const void *A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); /** * @brief Copies a sub-matrix from device (A) to host (B). * See \b clblasWriteSubMatrix(). * * @param[in] A specifies the source matrix on the device * @param[in] B specifies the destination matrix on the host * * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasReadSubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, void *B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a sub-matrix from device (A) to host (B). * See \b clblasReadSubMatrix() and \b clblasWriteSubMatrixAsync(). 
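 *
 * A hedged sketch of the asynchronous read pattern (the device buffer
 * \c devA, host array \c hostB, the 10x10 float matrix size and the queue
 * are hypothetical):
 * @code
 * cl_event done = NULL;
 * clblasStatus status = clblasReadSubMatrixAsync(
 *     clblasRowMajor, sizeof(cl_float),
 *     devA,  0, 10, 10, 10, 0, 0,   // device source
 *     hostB, 0, 10, 10, 10, 0, 0,   // host destination
 *     10, 10,                       // copy the whole matrix
 *     queue, 0, NULL, &done);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &done);    // block until the read completes
 * @endcode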
*/ clblasStatus clblasReadSubMatrixAsync( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, void *B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); /** * @brief Copies a sub-matrix from device (A) to device (B). * See \b clblasWriteSubMatrix(). * * @param[in] A specifies the source matrix on the device * @param[in] B specifies the destination matrix on the device * * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasCopySubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a sub-matrix from device (A) to device (B). * See \b clblasCopySubMatrix() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasCopySubMatrixAsync( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); /** * @brief Copies synchronously a vector from host (A) to device (B). * See \b clblasWriteSubMatrix(). * * @param[in] A specifies the source vector on the host * @param[in] B specifies the destination vector on the device * * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasWriteVector( size_t nb_elem, size_t element_size, const void *A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a vector from host (A) to device (B). * See \b clblasWriteVector() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasWriteVectorAsync( size_t nb_elem, size_t element_size, const void *A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies synchronously a vector from device (A) to host (B). * See \b clblasReadSubMatrix(). * * @param[in] A specifies the source vector on the device * @param[in] B specifies the destination vector on the host * * @return * - see \b clblasReadSubMatrix() */ clblasStatus clblasReadVector( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, void * B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a vector from device (A) to host (B). * See \b clblasReadVector() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasReadVectorAsync( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, void * B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies synchronously a vector from device (A) to device (B). * See \b clblasCopySubMatrix(). 
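 *
 * As a hedged illustration of how the synchronous vector helpers compose
 * (hypothetical host array \c x and device buffers \c bufX, \c bufY holding
 * at least \c n float elements):
 * @code
 * clblasWriteVector(n, sizeof(cl_float), x, 0, bufX, 0, queue, 0, NULL);    // host -> device
 * clblasCopyVector(n, sizeof(cl_float), bufX, 0, bufY, 0, queue, 0, NULL);  // device -> device
 * clblasReadVector(n, sizeof(cl_float), bufY, 0, x, 0, queue, 0, NULL);     // device -> host
 * @endcode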
* * @param[in] A specifies the source vector on the device * @param[in] B specifies the destination vector on the device * * @return * - see \b clblasCopySubMatrix() */ clblasStatus clblasCopyVector( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a vector from device (A) to device (B). * See \b clblasCopyVector() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasCopyVectorAsync( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies synchronously a whole matrix from host (A) to device (B). * See \b clblasWriteSubMatrix(). * * @param[in] A specifies the source matrix on the host * @param[in] B specifies the destination matrix on the device * * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasWriteMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const void *A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a matrix from host (A) to device (B). * See \b clblasWriteMatrix() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasWriteMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const void *A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies synchronously a whole matrix from device (A) to host (B). * See \b clblasReadSubMatrix(). * * @param[in] A specifies the source matrix on the device * @param[in] B specifies the destination matrix on the host * * @return * - see \b clblasReadSubMatrix() */ clblasStatus clblasReadMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, void * B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a matrix from device (A) to host (B). * See \b clblasReadMatrix() and \b clblasWriteSubMatrixAsync(). */ clblasStatus clblasReadMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, void * B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Copies synchronously a whole matrix from device (A) to device (B). * See \b clblasCopySubMatrix(). * * @param[in] A specifies the source matrix on the device * @param[in] B specifies the destination matrix on the device * * @return * - see \b clblasCopySubMatrix() */ clblasStatus clblasCopyMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Copies asynchronously a matrix from device (A) to device (B). * See \b clblasCopyMatrix() and \b clblasWriteSubMatrixAsync(). 
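 *
 * A hedged sketch (hypothetical device buffers \c bufA and \c bufB, each
 * holding an \c sx by \c sy float matrix with leading dimension \c ld):
 * @code
 * cl_event done = NULL;
 * clblasStatus status = clblasCopyMatrixAsync(
 *     clblasRowMajor, sx, sy, sizeof(cl_float),
 *     bufA, 0, ld,
 *     bufB, 0, ld,
 *     queue, 0, NULL, &done);
 * if (status == clblasSuccess)
 *     clWaitForEvents(1, &done);
 * @endcode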
*/ clblasStatus clblasCopyMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); /** * @brief Fill synchronously a vector with a pattern of a size element_size_bytes * * @param[in] nb_elem specifies the number of element in buffer A * @param[in] element_size specifies the size of one element of A. Supported sizes correspond * element size used in clBLAS (1,2,4,8,16) * @param[in] A specifies the source vector on the device * @param[in] offA specifies the offset of matrix A in * elements * @param[in] pattern specifies the host address of the pattern to fill with (element_size_bytes) * @param[in] command_queue specifies the OpenCL queue * @param[in] numEventsInWaitList specifies the number of OpenCL events * to wait for * @param[in] eventWaitList specifies the list of OpenCL events to * wait for * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasFillVector( size_t nb_elem, size_t element_size, cl_mem A, size_t offA, const void * host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Fill asynchronously a vector with a pattern of a size element_size_bytes * See \b clblasFillVector(). */ clblasStatus clblasFillVectorAsync( size_t nb_elem, size_t element_size, cl_mem A, size_t offA, const void * pattern, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); /** * @brief Fill synchronously a matrix with a pattern of a size element_size_bytes * * @param[in] order specifies the matrix order * @param[in] element_size specifies the size of one element of A. Supported sizes correspond * element size used in clBLAS (1,2,4,8,16) * @param[in] A specifies the source vector on the device * @param[in] offA specifies the offset of matrix A in * @param[in] ldA specifies the leading dimension of A * @param[in] nrA specifies the number of row in A * @param[in] ncA specifies the number of column in A * @param[in] pattern specifies the host address of the pattern to fill with (element_size_bytes) * @param[in] command_queue specifies the OpenCL queue * @param[in] numEventsInWaitList specifies the number of OpenCL events to wait for * @param[in] eventWaitList specifies the list of OpenCL events to wait for * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasFillMatrix( clblasOrder order, size_t element_size, cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, const void *pattern, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Partially fill a sub-matrix with a pattern of a size element_size_bytes * * * @param[in] order specifies the matrix order * @param[in] element_size specifies the size of one element of A. 
Supported values * are to element sizes used in clBLAS - that is 1, 2, 4, 8 or 16 * @param[in] offA specifies the offset of matrix A in elements * @param[in] ldA specifies the leading dimension of A in elements * @param[in] nrA specifies the number of rows of A * in elements * @param[in] ncA specifies the number of columns of A * in elements * @param[in] xA specifies the top-left x position to * copy from A * @param[in] yA specifies the top-left y position to * copy from A * @param[in] nx specifies the number of elements to * copy according to the x dimension (rows) * @param[in] ny specifies the number of elements to * copy according to the y dimension * (columns) * @param[in] pattern specifies the host address of the pattern to fill with (element_size_bytes) * @param[in] command_queue specifies the OpenCL queue * @param[in] numEventsInWaitList specifies the number of OpenCL events to wait for * @param[in] eventWaitList specifies the list of OpenCL events to wait for * @return * - see \b clblasWriteSubMatrix() */ clblasStatus clblasFillSubMatrix( clblasOrder order, size_t element_size, cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, size_t nx, size_t ny, const void *pattern, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList); /** * @brief Asynchronous asynchronously fill a sub-matrix with a pattern of a size element_size_bytes * See \b clblasFillSubMatrix(). */ clblasStatus clblasFillSubMatrixAsync( clblasOrder order, size_t element_size, cl_mem A, size_t offA, size_t ldA, size_t sxA, size_t syA, int xA, int yA, size_t nx, size_t ny, const void *host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* CLBLAS_H_ */ clblas-2.10/src/clBLAS.version.h.in000066400000000000000000000017001264277366700170070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* the configured version and settings for clblas */ #define clblasVersionMajor @clBLAS_VERSION_MAJOR@ #define clblasVersionMinor @clBLAS_VERSION_MINOR@ #define clblasVersionPatch @clBLAS_VERSION_PATCH@ clblas-2.10/src/clBLASConfig.cmake.in000066400000000000000000000002641264277366700173060ustar00rootroot00000000000000include(${CMAKE_CURRENT_LIST_DIR}/clBLASTargets.cmake) get_filename_component(CLBLAS_INCLUDE_DIRS ${CMAKE_CURRENT_LIST_DIR}/@reldir@/include ABSOLUTE) set(CLBLAS_LIBRARIES clBLAS) clblas-2.10/src/clBLASConfigVersion.cmake.in000066400000000000000000000033271264277366700206570ustar00rootroot00000000000000# This is a basic version file for the Config-mode of find_package(). # It is used by write_basic_package_version_file() as input file for configure_file() # to create a version-file which can be installed along a config.cmake file. 
# # The created file sets PACKAGE_VERSION_EXACT if the current version string and # the requested version string are exactly the same and it sets # PACKAGE_VERSION_COMPATIBLE if the current version is >= requested version, # but only if the requested major version is the same as the current one. # The variable CLBLAS_VERSION must be set before calling configure_file(). set(PACKAGE_VERSION "@CLBLAS_VERSION@") if("${PACKAGE_VERSION}" VERSION_LESS "${PACKAGE_FIND_VERSION}" ) set(PACKAGE_VERSION_COMPATIBLE FALSE) else() if("@CLBLAS_VERSION@" MATCHES "^([0-9]+)\\.") set(CLBLAS_VERSION_MAJOR "${CMAKE_MATCH_1}") else() set(CLBLAS_VERSION_MAJOR "@CLBLAS_VERSION@") endif() if("${PACKAGE_FIND_VERSION_MAJOR}" STREQUAL "${CLBLAS_VERSION_MAJOR}") set(PACKAGE_VERSION_COMPATIBLE TRUE) else() set(PACKAGE_VERSION_COMPATIBLE FALSE) endif() if( "${PACKAGE_FIND_VERSION}" STREQUAL "${PACKAGE_VERSION}") set(PACKAGE_VERSION_EXACT TRUE) endif() endif() # if the installed or the using project don't have CMAKE_SIZEOF_VOID_P set, ignore it: if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "" OR "@CMAKE_SIZEOF_VOID_P@" STREQUAL "") return() endif() # check that the installed version has the same 32/64bit-ness as the one which is currently searching: if(NOT "${CMAKE_SIZEOF_VOID_P}" STREQUAL "@CMAKE_SIZEOF_VOID_P@") math(EXPR installedBits "@CMAKE_SIZEOF_VOID_P@ * 8") set(PACKAGE_VERSION "${PACKAGE_VERSION} (${installedBits}bit)") set(PACKAGE_VERSION_UNSUITABLE TRUE) endif() clblas-2.10/src/client/000077500000000000000000000000001264277366700147655ustar00rootroot00000000000000clblas-2.10/src/client/CMakeLists.txt000066400000000000000000000044271264277366700175340ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## set(CLIENT_SRC client.cpp stdafx.cpp statisticalTimer.cpp) set(CLIENT_HEADER stdafx.h targetver.h statisticalTimer.h clfunc_common.hpp clfunc_xgemm.hpp clfunc_xgemv.hpp clfunc_xsymv.hpp clfunc_xtrmm.hpp clfunc_xtrsm.hpp clfunc_xsyrk.hpp clfunc_xsyr2k.hpp clfunc_xhemm.hpp clfunc_xsymm.hpp clfunc_xherk.hpp clfunc_xher2k.hpp) set(WRAPPER_SRC testPerfWrapper.cpp) add_definitions(-D_CRT_SECURE_NO_WARNINGS) # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long # http://code.google.com/p/googletest/issues/detail?id=334 if( CMAKE_COMPILER_IS_GNUCXX ) add_definitions( -Wno-long-long ) endif( ) include_directories( ${Boost_INCLUDE_DIRS} ${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/tests/include .) 
add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER}) target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS) set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" OUTPUT_NAME clBLAS-client ) add_executable(testPerfWrapper ${WRAPPER_SRC}) target_link_libraries(testPerfWrapper ${Boost_LIBRARIES}) set_target_properties( testPerfWrapper PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) # CPack configuration; include the executable into the package install( TARGETS client testPerfWrapper RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) clblas-2.10/src/client/clGemm.h000066400000000000000000000460711264277366700163520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include cl_int gemm_err; std::string prettyPrintClStatus( const cl_int& status ) { switch( status ) { case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; 
case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; case CL_MAP_FAILURE: return "CL_MAP_FAILURE"; case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; case CL_SUCCESS: return "CL_SUCCESS"; default: return "Error code not defined"; break; } } // This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition. // If an error occurs, we throw. // Note: std::runtime_error does not take unicode strings as input, so only strings supported inline cl_int OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno ) { switch( res ) { case CL_SUCCESS: /**< No error */ break; default: { std::stringstream tmp; tmp << "OPENCL_V_THROWERROR< "; tmp << prettyPrintClStatus(res) ; tmp << " > ("; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm<< std::endl; throw std::runtime_error( errorm ); } } return res; } #define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, __LINE__) enum complexity_t { not_complex = 1, yes_complex = 2 }; //can be cl_float, cl_double //TODO should be cl_float2 and cl_double2 instead of using float/double * complexity? 
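// Note on the layout used below: the buffers helper stores a "complex" matrix
// as interleaved real/imaginary scalars of type T rather than as cl_float2 /
// cl_double2 (see the TODO above), so every host allocation and every
// clCreateBuffer size is scaled by the complexity factor. For example, with
// T = cl_float and yes_complex, an M x lda matrix occupies
// M * lda * 2 * sizeof(cl_float) bytes on the device.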
template< class T > class buffers { public: size_t M, N, K; size_t lda, ldb, ldc; complexity_t complexity; T* A; T* B; T* C; cl_mem bufA, bufB, bufC; cl_command_queue queue; std::map<std::string, T*> buffer_map; std::map<std::string, size_t> rows_map; std::map<std::string, size_t> ldx_map; buffers( cl_context ctx, cl_command_queue _queue, size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, complexity_t _complexity ) : M(_M) , N(_N) , K(_K) , lda(_lda) , ldb(_ldb) , ldc(_ldc) , complexity(_complexity) , A(new T[M*lda*sizeof(T)*complexity]) , B(new T[K*ldb*sizeof(T)*complexity]) , C(new T[M*ldc*sizeof(T)*complexity]) , queue(_queue) { // request and initialize openCL memory bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * lda * sizeof(*A) * complexity, NULL, &gemm_err); OPENCL_V_THROW( gemm_err, "creating buffer A" ); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * ldb * sizeof(*B) * complexity, NULL, &gemm_err); OPENCL_V_THROW( gemm_err, "creating buffer B" ); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * ldc * sizeof(*C) * complexity, NULL, &gemm_err); OPENCL_V_THROW( gemm_err, "creating buffer C" ); buffer_map.insert(std::pair<std::string, T*>("A",A)); buffer_map.insert(std::pair<std::string, T*>("B",B)); buffer_map.insert(std::pair<std::string, T*>("C",C)); rows_map.insert(std::pair<std::string, size_t>("A",M)); rows_map.insert(std::pair<std::string, size_t>("B",K)); rows_map.insert(std::pair<std::string, size_t>("C",M)); ldx_map.insert(std::pair<std::string, size_t>("A",lda)); ldx_map.insert(std::pair<std::string, size_t>("B",ldb)); ldx_map.insert(std::pair<std::string, size_t>("C",ldc)); initialize_data(); } ~buffers() { OPENCL_V_THROW( clReleaseMemObject(bufC), "releasing buffer C"); OPENCL_V_THROW( clReleaseMemObject(bufB), "releasing buffer B"); OPENCL_V_THROW( clReleaseMemObject(bufA), "releasing buffer A"); delete[] A; delete[] B; delete[] C; } void initialize_data() { initializeLocalMatrix("A"); initializeLocalMatrix("B"); initializeLocalMatrix("C"); gemm_err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * K * sizeof(*A) * complexity, A, 0, NULL, NULL); OPENCL_V_THROW( gemm_err, "writing to buffer A" ); gemm_err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K * N * sizeof(*B) * complexity, B, 0, NULL, NULL); OPENCL_V_THROW( gemm_err, "writing to buffer B" ); gemm_err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C) * complexity, C, 0, NULL, NULL); OPENCL_V_THROW( gemm_err, "writing to buffer C" ); } void read_back_result() { OPENCL_V_THROW( clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C) * complexity, C, 0, NULL, NULL), "reading from buffer C" ); } void initializeLocalMatrix(std::string matrix) { for (size_t i = 0; i < rows_map[matrix]*complexity; i++) { for (size_t j = 0; j < ldx_map[matrix]; j++) { buffer_map[matrix][i * ldx_map[matrix] + j] = (i+1)*10 + (j+1); } } } void printLocalMatrix(std::string matrix) { for (size_t i = 0; i < rows_map[matrix]*complexity; i++) { for (size_t j = 0; j < ldx_map[matrix]; j++) { std::cout << (int)buffer_map[matrix][i * ldx_map[matrix] + j] << " "; } std::cout << std::endl; } std::cout << std::endl; } }; class clGemm { public: size_t M; size_t N; size_t K; size_t lda; size_t ldb; size_t ldc; clblasOrder order; clblasTranspose transA; clblasTranspose transB; cl_context_properties props[3]; cl_platform_id platform; cl_device_id device; cl_context ctx; cl_device_type deviceType; cl_command_queue queue; cl_event event; cl_uint commandQueueFlags; bool useimages; cl_ulong imgA; cl_ulong imgB; StatisticalTimer& timer; StatisticalTimer::sTimerID gemm_timer_id; clGemm( size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, bool _useimages, clblasOrder _order, clblasTranspose _transA, clblasTranspose 
_transB, cl_device_type _deviceType, cl_uint _commandQueueFlags, StatisticalTimer& _timer ) : M(_M) , N(_N) , K(_K) , lda(_lda) , ldb(_ldb) , ldc(_ldc) , order(_order) , transA(_transA) , transB(_transB) , deviceType(_deviceType) , event(NULL) , commandQueueFlags(_commandQueueFlags) , useimages(_useimages) , imgA(0) , imgB(0) , timer(_timer) { props[0] = CL_CONTEXT_PLATFORM; props[1] = 0; props[2] = 0; OPENCL_V_THROW( clGetPlatformIDs(1, &platform, NULL), "getting platform IDs" ); OPENCL_V_THROW( clGetDeviceIDs(platform, deviceType, 1, &device, NULL), "getting device IDs" ); props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &gemm_err); OPENCL_V_THROW( gemm_err, "creating context" ); queue = clCreateCommandQueue(ctx, device, commandQueueFlags, &gemm_err); OPENCL_V_THROW( gemm_err, "creating command queue" ); gemm_err = clblasSetup(); if (gemm_err != CL_SUCCESS) { std::cout << "clblasSetup() failed with " << gemm_err << std::endl; clReleaseCommandQueue(queue); clReleaseContext(ctx); exit(1); } if (useimages) { imgA = clblasAddScratchImage(ctx, 16, 64, NULL); imgB = clblasAddScratchImage(ctx, 16, 64, NULL); } gemm_timer_id = timer.getUniqueID( "clGemm", 0 ); } ~clGemm() { if (useimages) { clblasRemoveScratchImage(imgA); clblasRemoveScratchImage(imgB); } clblasTeardown(); OPENCL_V_THROW( clReleaseCommandQueue(queue), "releasing command queue" ); OPENCL_V_THROW( clReleaseContext(ctx), "releasing context" ); } void wait_and_check() { cl_int wait_status = clWaitForEvents(1, &event); if( wait_status != CL_SUCCESS ) { if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) { clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &gemm_err, NULL ); std::cout << "blas function execution status error: " << gemm_err << std::endl; exit(1); } else { std::cout << "blas function wait status error: " << wait_status << std::endl; exit(1); } } } double time_in_ns() { StatisticalTimer& timer = StatisticalTimer::getInstance( ); return timer.getAverageTime( gemm_timer_id ) * 1e9; } virtual void call_gemm() = 0; virtual void clear_buffers() = 0; virtual double gflops() = 0; virtual std::string gflops_formula() = 0; }; class clSgemm : public clGemm { public: cl_float alpha; cl_float beta; buffers mybuffers; clSgemm( size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, bool _useimages, clblasOrder _order, clblasTranspose _transA, clblasTranspose _transB, cl_float _alpha, cl_float _beta, cl_device_type _deviceType, cl_uint _commandQueueFlags, StatisticalTimer& _timer) : clGemm( _M, _N, _K, _lda, _ldb, _ldc, _useimages, _order, _transA, _transB, _deviceType, _commandQueueFlags, _timer ) , alpha(_alpha) , beta(_beta) , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex ) {} void call_gemm() { timer.Start(gemm_timer_id); OPENCL_V_THROW( clblasSgemm(order, transA, transB, M, N, K, alpha, mybuffers.bufA, lda, mybuffers.bufB, ldb, beta, mybuffers.bufC, ldc, 1, &queue, 0, NULL, &event), "clblasSgemm" ); wait_and_check(); timer.Stop(gemm_timer_id); //mybuffers.read_back_result(); //mybuffers.printLocalMatrix("C"); } void clear_buffers() { mybuffers.initialize_data(); } double gflops() { return (2*M*N*K)/time_in_ns(); } std::string gflops_formula() { return "(2*M*N*K)/time_in_ns"; } }; class clDgemm : public clGemm { public: cl_double alpha; cl_double beta; buffers mybuffers; clDgemm( size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, bool _useimages, clblasOrder _order, clblasTranspose _transA, 
clblasTranspose _transB, cl_double _alpha, cl_double _beta, cl_device_type _deviceType, cl_uint _commandQueueFlags, StatisticalTimer& _timer) : clGemm( _M, _N, _K, _lda, _ldb, _ldc, _useimages, _order, _transA, _transB, _deviceType, _commandQueueFlags, _timer ) , alpha(_alpha) , beta(_beta) , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex ) {} void call_gemm() { timer.Start(gemm_timer_id); OPENCL_V_THROW( clblasDgemm(order, transA, transB, M, N, K, alpha, mybuffers.bufA, lda, mybuffers.bufB, ldb, beta, mybuffers.bufC, ldc, 1, &queue, 0, NULL, &event), "clblasDgemm" ); wait_and_check(); timer.Stop(gemm_timer_id); //mybuffers.read_back_result(); //mybuffers.printLocalMatrix("C"); } void clear_buffers() { mybuffers.initialize_data(); } double gflops() { return (2*M*N*K)/time_in_ns(); } std::string gflops_formula() { return "(2*M*N*K)/time_in_ns"; } }; class clCgemm : public clGemm { public: cl_float2 alpha; cl_float2 beta; buffers mybuffers; clCgemm( size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, bool _useimages, clblasOrder _order, clblasTranspose _transA, clblasTranspose _transB, cl_float _alpha, cl_float _beta, cl_device_type _deviceType, cl_uint _commandQueueFlags, StatisticalTimer& _timer) : clGemm( _M, _N, _K, _lda, _ldb, _ldc, _useimages, _order, _transA, _transB, _deviceType, _commandQueueFlags, _timer ) , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex ) { alpha.s[0] = _alpha; alpha.s[1] = _alpha; beta.s[0] = _beta; beta.s[1] = _beta; } void call_gemm() { timer.Start(gemm_timer_id); OPENCL_V_THROW( clblasCgemm(order, transA, transB, M, N, K, alpha, mybuffers.bufA, lda, mybuffers.bufB, ldb, beta, mybuffers.bufC, ldc, 1, &queue, 0, NULL, &event), "clblasCgemm" ); wait_and_check(); timer.Stop(gemm_timer_id); //mybuffers.read_back_result(); //mybuffers.printLocalMatrix("C"); } void clear_buffers() { mybuffers.initialize_data(); } double gflops() { return (8*M*N*K)/time_in_ns(); } std::string gflops_formula() { return "(8*M*N*K)/time_in_ns"; } }; class clZgemm : public clGemm { public: cl_double2 alpha; cl_double2 beta; buffers mybuffers; clZgemm( size_t _M, size_t _N, size_t _K, size_t _lda, size_t _ldb, size_t _ldc, bool _useimages, clblasOrder _order, clblasTranspose _transA, clblasTranspose _transB, cl_double _alpha, cl_double _beta, cl_device_type _deviceType, cl_uint _commandQueueFlags, StatisticalTimer& _timer) : clGemm( _M, _N, _K, _lda, _ldb, _ldc, _useimages, _order, _transA, _transB, _deviceType, _commandQueueFlags, _timer ) , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex ) { alpha.s[0] = _alpha; alpha.s[1] = _alpha; beta.s[0] = _beta; beta.s[1] = _beta; } void call_gemm() { timer.Start(gemm_timer_id); OPENCL_V_THROW( clblasZgemm(order, transA, transB, M, N, K, alpha, mybuffers.bufA, lda, mybuffers.bufB, ldb, beta, mybuffers.bufC, ldc, 1, &queue, 0, NULL, &event), "clblasZgemm" ); wait_and_check(); timer.Stop(gemm_timer_id); //mybuffers.read_back_result(); //mybuffers.printLocalMatrix("C"); } void clear_buffers() { mybuffers.initialize_data(); } double gflops() { return (8*M*N*K)/time_in_ns(); } std::string gflops_formula() { return "(8*M*N*K)/time_in_ns"; } }; clblas-2.10/src/client/clfunc_common.hpp000066400000000000000000000253251264277366700203270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_BENCHMARK_COMMON_HXX__ #define CLBLAS_BENCHMARK_COMMON_HXX__ #include <iostream> #include <sstream> #include <string> #include <stdexcept> #include <cstdlib> #include "blas-math.h" #include "test-limits.h" #include "dis_warning.h" #include "clBLAS.h" #if defined(__APPLE__) || defined(__MACOSX) #include <OpenCL/cl.h> #else #include <CL/cl.h> #endif template <typename T> static T makeScalar(double val) { return static_cast<T>(val); } template<> __template_static FloatComplex makeScalar<FloatComplex>(double val) { FloatComplex c; c.s[0] = static_cast<cl_float>(val); c.s[1] = 0; return c; } template<> __template_static DoubleComplex makeScalar<DoubleComplex>(double val) { DoubleComplex c; c.s[0] = val; c.s[1] = 0; return c; } template <typename T> static T randomScale() { T t = random<T>(UPPER_BOUND<T>()); if (module(t) == 0) { t = t + ONE<T>(); } return t; } std::string prettyPrintClStatus( const cl_int& status ) { switch( status ) { case CL_INVALID_GLOBAL_WORK_SIZE: return "CL_INVALID_GLOBAL_WORK_SIZE"; case CL_INVALID_MIP_LEVEL: return "CL_INVALID_MIP_LEVEL"; case CL_INVALID_BUFFER_SIZE: return "CL_INVALID_BUFFER_SIZE"; case CL_INVALID_GL_OBJECT: return "CL_INVALID_GL_OBJECT"; case CL_INVALID_OPERATION: return "CL_INVALID_OPERATION"; case CL_INVALID_EVENT: return "CL_INVALID_EVENT"; case CL_INVALID_EVENT_WAIT_LIST: return "CL_INVALID_EVENT_WAIT_LIST"; case CL_INVALID_GLOBAL_OFFSET: return "CL_INVALID_GLOBAL_OFFSET"; case CL_INVALID_WORK_ITEM_SIZE: return "CL_INVALID_WORK_ITEM_SIZE"; case CL_INVALID_WORK_GROUP_SIZE: return "CL_INVALID_WORK_GROUP_SIZE"; case CL_INVALID_WORK_DIMENSION: return "CL_INVALID_WORK_DIMENSION"; case CL_INVALID_KERNEL_ARGS: return "CL_INVALID_KERNEL_ARGS"; case CL_INVALID_ARG_SIZE: return "CL_INVALID_ARG_SIZE"; case CL_INVALID_ARG_VALUE: return "CL_INVALID_ARG_VALUE"; case CL_INVALID_ARG_INDEX: return "CL_INVALID_ARG_INDEX"; case CL_INVALID_KERNEL: return "CL_INVALID_KERNEL"; case CL_INVALID_KERNEL_DEFINITION: return "CL_INVALID_KERNEL_DEFINITION"; case CL_INVALID_KERNEL_NAME: return "CL_INVALID_KERNEL_NAME"; case CL_INVALID_PROGRAM_EXECUTABLE: return "CL_INVALID_PROGRAM_EXECUTABLE"; case CL_INVALID_PROGRAM: return "CL_INVALID_PROGRAM"; case CL_INVALID_BUILD_OPTIONS: return "CL_INVALID_BUILD_OPTIONS"; case CL_INVALID_BINARY: return "CL_INVALID_BINARY"; case CL_INVALID_SAMPLER: return "CL_INVALID_SAMPLER"; case CL_INVALID_IMAGE_SIZE: return "CL_INVALID_IMAGE_SIZE"; case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; case CL_INVALID_MEM_OBJECT: return "CL_INVALID_MEM_OBJECT"; case CL_INVALID_HOST_PTR: return "CL_INVALID_HOST_PTR"; case CL_INVALID_COMMAND_QUEUE: return "CL_INVALID_COMMAND_QUEUE"; case CL_INVALID_QUEUE_PROPERTIES: return "CL_INVALID_QUEUE_PROPERTIES"; case CL_INVALID_CONTEXT: return "CL_INVALID_CONTEXT"; case CL_INVALID_DEVICE: return "CL_INVALID_DEVICE"; case CL_INVALID_PLATFORM: return "CL_INVALID_PLATFORM"; case CL_INVALID_DEVICE_TYPE: return "CL_INVALID_DEVICE_TYPE"; case CL_INVALID_VALUE: return "CL_INVALID_VALUE"; case CL_MAP_FAILURE: return 
"CL_MAP_FAILURE"; case CL_BUILD_PROGRAM_FAILURE: return "CL_BUILD_PROGRAM_FAILURE"; case CL_IMAGE_FORMAT_NOT_SUPPORTED: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; case CL_IMAGE_FORMAT_MISMATCH: return "CL_IMAGE_FORMAT_MISMATCH"; case CL_MEM_COPY_OVERLAP: return "CL_MEM_COPY_OVERLAP"; case CL_PROFILING_INFO_NOT_AVAILABLE: return "CL_PROFILING_INFO_NOT_AVAILABLE"; case CL_OUT_OF_HOST_MEMORY: return "CL_OUT_OF_HOST_MEMORY"; case CL_OUT_OF_RESOURCES: return "CL_OUT_OF_RESOURCES"; case CL_MEM_OBJECT_ALLOCATION_FAILURE: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; case CL_COMPILER_NOT_AVAILABLE: return "CL_COMPILER_NOT_AVAILABLE"; case CL_DEVICE_NOT_AVAILABLE: return "CL_DEVICE_NOT_AVAILABLE"; case CL_DEVICE_NOT_FOUND: return "CL_DEVICE_NOT_FOUND"; case CL_SUCCESS: return "CL_SUCCESS"; default: return "Error code not defined"; break; } } // This is used to either wrap an OpenCL function call, or to // explicitly check a variable for an OpenCL error condition. // If an error occurs, we throw. // Note: std::runtime_error does not take unicode strings as input, so // only strings supported inline cl_int OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno ) { switch( res ) { case CL_SUCCESS: /**< No error */ break; default: { std::stringstream tmp; tmp << "OPENCL_V_THROWERROR< "; tmp << prettyPrintClStatus(res) ; tmp << " > ("; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm<< std::endl; throw std::runtime_error( errorm ); } } return res; } #define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, \ __LINE__) inline cl_ulong queryMemAllocSize( cl_device_id device_ ) { cl_int err; cl_ulong rc = 0; err = clGetDeviceInfo(device_, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(rc), &rc, NULL); return rc; } class clblasFunc { public: clblasFunc(StatisticalTimer& _timer, cl_device_type devType) : timer(_timer) { cl_int err; /* Setup OpenCL environment. */ OPENCL_V_THROW(clGetPlatformIDs(1, &platform_, NULL), "getting platform IDs"); OPENCL_V_THROW(clGetDeviceIDs(platform_, devType, 1, &device_, NULL), "getting device IDs"); props_[0] = CL_CONTEXT_PLATFORM; props_[1] = (cl_context_properties)platform_; props_[2] = 0; ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err); OPENCL_V_THROW(err, "creating context"); for (unsigned int i = 0; i < numQueues; i++) { queues_[i] = clCreateCommandQueue(ctx_, device_, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err); } timer_id = timer.getUniqueID( "clfunc", 0 ); maxMemAllocSize = queryMemAllocSize( device_ ); /* Setup clblas. 
*/ err = clblasSetup(); if (err != CL_SUCCESS) { std::cerr << "clblasSetup() failed with %d\n"; for (unsigned int i = 0; i < numQueues; i++) { clReleaseCommandQueue(queues_[i]); } clReleaseContext(ctx_); } } virtual ~clblasFunc() { clblasTeardown(); for (unsigned int i = 0; i < numQueues; i++) { OPENCL_V_THROW( clReleaseCommandQueue(queues_[i]), "releasing command queue" ); } OPENCL_V_THROW( clReleaseContext(ctx_), "releasing context" ); } void wait_and_check() { cl_int err; cl_int wait_status = clWaitForEvents(1, &event_); if( wait_status != CL_SUCCESS ) { if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST ) { clGetEventInfo( event_, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &err, NULL ); std::cout << "blas function execution status error: " << err << std::endl; exit(1); } else { std::cout << "blas function wait status error: " << wait_status << std::endl; exit(1); } } } double time_in_ns() { StatisticalTimer& timer = StatisticalTimer::getInstance( ); return timer.getAverageTime( timer_id ) * 1e9; } virtual void call_func() = 0; virtual double gflops() = 0; virtual std::string gflops_formula() = 0; virtual void setup_apiCallCount(cl_uint apiCallCount){} virtual void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) = 0; virtual void initialize_cpu_buffer() = 0; virtual void initialize_gpu_buffer() = 0; virtual void reset_gpu_write_buffer() = 0; virtual void read_gpu_buffer() = 0; virtual void roundtrip_func() = 0; virtual void roundtrip_func_rect() {} virtual void allochostptr_roundtrip_func() {} virtual void usehostptr_roundtrip_func() {} virtual void copyhostptr_roundtrip_func() {} virtual void usepersismem_roundtrip_func() {} virtual void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) = 0; virtual void releaseGPUBuffer_deleteCPUBuffer()=0; StatisticalTimer& timer; StatisticalTimer::sTimerID timer_id; protected: virtual void initialize_scalars(double alpha, double beta) = 0; protected: cl_platform_id platform_; cl_device_id device_; cl_context_properties props_[3]; cl_context ctx_; static const unsigned int numQueues = 4; cl_command_queue queues_[numQueues]; clblasOrder order_; cl_event event_; size_t maxMemAllocSize; }; // class clblasFunc #endif // ifndef CLBLAS_BENCHMARK_COMMON_HXX__ clblas-2.10/src/client/clfunc_xgemm.hpp000066400000000000000000001204271264277366700201530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ #define CLBLAS_BENCHMARK_XGEMM_HXX__ #include "clfunc_common.hpp" template struct xGemmBuffer { clblasOrder order_; size_t m_; size_t n_; size_t k_; size_t lda_; size_t ldb_; size_t ldc_; size_t offA_; size_t offB_; size_t offC_; size_t a_num_vectors_; size_t b_num_vectors_; size_t c_num_vectors_; clblasTranspose trans_a_; clblasTranspose trans_b_; T* a_; T* b_; T* c_; cl_mem buf_a_; cl_mem buf_b_; cl_mem buf_c_; T alpha_; T beta_; cl_uint apiCallCount; }; // struct buffer template class xGemm : public clblasFunc { public: xGemm(StatisticalTimer& timer, cl_device_type devType, unsigned int iNumQueuesToUse) : clblasFunc(timer, devType), numQueuesToUse(iNumQueuesToUse) { timer.getUniqueID("clGemm", 0); } ~xGemm() { } void call_func() { timer.Start(timer_id); xGemm_Function(true, buffer_.apiCallCount); timer.Stop(timer_id); } double gflops() { return (2.0*buffer_.m_*buffer_.n_*buffer_.k_) / (time_in_ns() / buffer_.apiCallCount); } void setup_apiCallCount(cl_uint apiCallCount) { buffer_.apiCallCount = apiCallCount; } std::string gflops_formula() { return "2.0*M*N*K/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_3(side_option, uplo_option, diag_option); initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offB_ = offBX; buffer_.offC_ = offCY; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; buffer_.a_num_vectors_ = M; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = M; } else if (lda < M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } if (transB_option == 0) { buffer_.b_num_vectors_ = K; buffer_.trans_b_ = clblasNoTrans; if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.b_num_vectors_ = N; if (transB_option == 1) { buffer_.trans_b_ = clblasTrans; } else if (transB_option == 2) { buffer_.trans_b_ = clblasConjTrans; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = M; } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.trans_a_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = M; } else if (lda < M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = M; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } if (transB_option == 0) { buffer_.b_num_vectors_ = N; buffer_.trans_b_ = clblasNoTrans; if (ldb == 0) 
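// Worked example of the GFLOPS formula used by gflops() in this class
// (a sketch, assuming a single API call per timed iteration): for
// M = N = K = 1024 and an average time of 10 ms per clblasSgemm call,
//   flops  = 2.0 * 1024 * 1024 * 1024 ~= 2.147e9
//   time   = 10 ms = 1.0e7 ns
//   gflops = 2.147e9 / 1.0e7 ~= 214.7
// The complex specializations at the end of this file scale the same product
// by 8.0, since a complex multiply-add costs eight real flops.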
{ buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.b_num_vectors_ = K; if (transB_option == 1) { buffer_.trans_b_ = clblasTrans; } else if (transB_option == 2) { buffer_.trans_b_ = clblasConjTrans; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } if (ldc == 0) { buffer_.ldc_ = M; } else if (ldc < M) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.b_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldb_; ++j) { buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.c_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldc_; ++j) { buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], 
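// A minimal, self-contained version of this round trip (a sketch only; the
// names ctx, queue, hA, hB, hC, M, N, K, alpha and beta are placeholders,
// and row-major, non-transposed float matrices are assumed):
//
//   cl_int err;
//   cl_mem dA = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  M * K * sizeof(cl_float), NULL, &err);
//   cl_mem dB = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  K * N * sizeof(cl_float), NULL, &err);
//   cl_mem dC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(cl_float), NULL, &err);
//   err = clEnqueueWriteBuffer(queue, dA, CL_TRUE, 0, M * K * sizeof(cl_float), hA, 0, NULL, NULL);
//   err = clEnqueueWriteBuffer(queue, dB, CL_TRUE, 0, K * N * sizeof(cl_float), hB, 0, NULL, NULL);
//   err = clEnqueueWriteBuffer(queue, dC, CL_TRUE, 0, M * N * sizeof(cl_float), hC, 0, NULL, NULL);
//   cl_event done;
//   clblasStatus status = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
//                                     M, N, K, alpha, dA, 0, K, dB, 0, N,
//                                     beta, dC, 0, N, 1, &queue, 0, NULL, &done);
//   err = clEnqueueReadBuffer(queue, dC, CL_TRUE, 0, M * N * sizeof(cl_float), hC, 1, &done, NULL);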
buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); xGemm_Function(false); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void roundtrip_func_rect() { timer.Start(timer_id); cl_int err; //rect size_t a_buffer_origin[3] = {0,0,0}; size_t a_host_origin[3] = {0,0,0}; size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; size_t a_buffer_row_pitch=0*sizeof(T);//lda size_t a_buffer_slice_pitch=0; size_t a_host_row_pitch=buffer_.lda_*sizeof(T); size_t a_host_slice_pitch=0; size_t b_buffer_origin[3] = {0,0,0}; size_t b_host_origin[3] = {0,0,0}; size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1}; size_t b_buffer_row_pitch=0*sizeof(T);//ldb size_t b_buffer_slice_pitch=0; size_t b_host_row_pitch=buffer_.ldb_*sizeof(T); size_t b_host_slice_pitch=0; size_t c_buffer_origin[3] = {0,0,0}; size_t c_host_origin[3] = {0,0,0}; size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1}; size_t c_buffer_row_pitch=0*sizeof(T);//ldc size_t c_buffer_slice_pitch=0; size_t c_host_row_pitch=buffer_.ldc_*sizeof(T); size_t c_host_slice_pitch=0; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_*buffer_.m_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.k_ * buffer_.n_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.m_ * buffer_.n_ + buffer_.offC_) * sizeof(T), NULL, &err); /* err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL);*/ err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch, a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch, b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL); if(buffer_.trans_a_==clblasNoTrans) { buffer_.lda_=buffer_.m_; } else { buffer_.lda_=buffer_.k_; } if(buffer_.trans_b_==clblasNoTrans) { buffer_.ldb_=buffer_.k_; } else { buffer_.ldb_=buffer_.n_; } buffer_.ldc_=buffer_.m_; xGemm_Function(false); /* err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, &event_); */ err = ::clEnqueueReadBufferRect(queues_[0], buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch, c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void 
allochostptr_roundtrip_func() { timer.Start(timer_id); cl_int err; // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); // map the buffers to pointers at host device T *map_a,*map_b,*map_c; map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), 0, NULL, NULL, &err); map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), 0, NULL, NULL, &err); // memcpy the input A, B, C to the host pointers memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); // unmap the buffers clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); // calling clBLAS xGemm_Function(false); // map the C buffer again to read output map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), 0, NULL, NULL, &err); memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void usehostptr_roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), buffer_.a_, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), buffer_.b_, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); xGemm_Function(true); timer.Stop(timer_id); } void copyhostptr_roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), buffer_.a_, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), buffer_.b_, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), buffer_.c_, &err); xGemm_Function(false); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, 
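// Generic sketch of reading results back through a mapped pointer (the
// CL_MAP_READ step used by the zero-copy paths above); queue, dC, bytes and
// hC are placeholders, not members of this class:
//
//   cl_int err;
//   void* p = clEnqueueMapBuffer(queue, dC, CL_TRUE, CL_MAP_READ, 0, bytes,
//                                0, NULL, NULL, &err);
//   memcpy(hC, p, bytes);                              // mapped region -> host copy
//   clEnqueueUnmapMemObject(queue, dC, p, 0, NULL, NULL);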
buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void usepersismem_roundtrip_func() { #if defined(CL_MEM_USE_PERSISTENT_MEM_AMD) timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); // map the buffers to pointers at host devices T *map_a,*map_b,*map_c; map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), 0, NULL, NULL, &err); map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), 0, NULL, NULL, &err); // memcpy the input A, B, C to the host pointers memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); // unmap the buffers clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, NULL); // calling clBLAS xGemm_Function(false); // map the C buffer again to read output map_c = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, (buffer_.lda_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), 0, NULL, NULL, &err); memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) ); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_c_, map_c, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); #else std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<(alpha); buffer_.beta_ = makeScalar(beta); } private: xGemmBuffer buffer_; void xGemm_Function(bool flush, cl_uint apiCallCount = 1); unsigned int numQueuesToUse; cl_event events_[numQueues]; }; // class xgemm template<> void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) { clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } //flush==true if only the kernel time (library call) is timed //flush==false if memory time is also timed if (flush==true) { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { if (events_[i]) { cl_uint clReferenceCount; cl_int err = 
clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT, sizeof(clReferenceCount), &clReferenceCount, NULL); if ( err == CL_SUCCESS) { //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount ); numValidEvents++; } else { //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err ); } } else { //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } clWaitForEvents(numValidEvents, events_); } } template<> void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) { clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } //flush==true if only the kernel time (library call) is timed //flush==false if memory time is also timed if (flush==true) { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { if (events_[i]) { cl_uint clReferenceCount; cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT, sizeof(clReferenceCount), &clReferenceCount, NULL); if ( err == CL_SUCCESS) { //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount ); numValidEvents++; } else { //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err ); } } else { //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } clWaitForEvents(numValidEvents, events_); } } template<> void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) { clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } //flush==true if only the kernel time (library call) is timed //flush==false if memory time is also timed if (flush==true) { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { if (events_[i]) { cl_uint clReferenceCount; cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT, sizeof(clReferenceCount), &clReferenceCount, NULL); if ( err == CL_SUCCESS) { //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount ); numValidEvents++; } else { //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err ); } } else { //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } clWaitForEvents(numValidEvents, events_); } } template<> void xGemm:: xGemm_Function(bool flush, cl_uint apiCallCount ) { for (unsigned int i = 0; i < numQueues; i++) { events_[i] = NULL; } for (unsigned int i = 0; i < apiCallCount; i++) { clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_, buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, 
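// The flush-and-wait step of xGemm_Function, restated in isolation (a
// sketch; queues, events and n stand for the per-queue arrays used above,
// with the non-NULL events compacted so clWaitForEvents only sees valid
// handles):
//
//   cl_uint valid = 0;
//   for (cl_uint i = 0; i < n; ++i)
//       if (events[i] != NULL)
//           events[valid++] = events[i];   // keep only events clBLAS actually returned
//   for (cl_uint i = 0; i < n; ++i)
//       clFlush(queues[i]);                // submit any batched commands
//   if (valid > 0)
//       clWaitForEvents(valid, events);    // block until the GEMM calls finish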
buffer_.offC_, buffer_.ldc_, numQueuesToUse, queues_, 0, NULL, events_); } //flush==true if only the kernel time (library call) is timed //flush==false if memory time is also timed if (flush==true) { // check if any valid events returned cl_uint numValidEvents = 0; for (unsigned int i = 0; i < numQueuesToUse; i++) { if (events_[i]) { cl_uint clReferenceCount; cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT, sizeof(clReferenceCount), &clReferenceCount, NULL); if ( err == CL_SUCCESS) { //printf("events[%u/%u] has %u references\n", i, numQueuesToUse, clReferenceCount ); numValidEvents++; } else { //printf("events[%u/%u] invalid; err = %i\n", i, numQueuesToUse, err ); } } else { //printf("events[%u/%u] is NULL\n", i, numQueuesToUse ); } } for (unsigned int i = 0; i < numQueuesToUse; i++) { clFlush(queues_[i]); } clWaitForEvents(numValidEvents, events_); } } template<> double xGemm:: gflops() { return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/(time_in_ns() / buffer_.apiCallCount); } template<> double xGemm:: gflops() { return (8.0*buffer_.m_*buffer_.n_*buffer_.k_)/(time_in_ns() / buffer_.apiCallCount); } template<> std::string xGemm:: gflops_formula() { return "8.0*M*N*K/time"; } template<> std::string xGemm:: gflops_formula() { return "8.0*M*N*K/time"; } #endif // ifndef CLBLAS_BENCHMARK_XGEMM_HXX__ clblas-2.10/src/client/clfunc_xgemv.hpp000066400000000000000000000257121264277366700201650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XGEMV_HXX__ #define CLBLAS_BENCHMARK_XGEMV_HXX__ #include "clfunc_common.hpp" template struct xGemvBuffer { clblasOrder order_; size_t m_; size_t n_; size_t lda_; size_t offA_; size_t a_num_vectors_; size_t b_num_vectors_; size_t c_num_vectors_; clblasTranspose trans_a_; T* a_; T* x_; T* y_; cl_mem buf_a_; cl_mem buf_x_; cl_mem buf_y_; T alpha_; T beta_; }; // struct buffer template class xGemv : public clblasFunc { public: xGemv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clGemv", 0); } ~xGemv() { delete buffer_.a_; delete buffer_.x_; delete buffer_.y_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_y_), "releasing buffer Y"); } void call_func() { } double gflops() { return (2.0*buffer_.m_*buffer_.n_)/time_in_ns(); } std::string gflops_formula() { return "2.0*M*N/time"; // NOTE i removed a \n from the end of this. 
it needs to be absent // from all functions } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, uplo_option, diag_option, transB_option); DUMMY_ARGS_USAGE_3(K, ldb, ldc); DUMMY_ARGS_USAGE_2(offBX, offCY); initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.offA_ = offA; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; buffer_.x_ = new T[buffer_.n_]; buffer_.y_ = new T[buffer_.m_]; } else { buffer_.trans_a_ = clblasTrans; buffer_.x_ = new T[buffer_.m_]; buffer_.y_ = new T[buffer_.n_]; } if (order_option == 0) { order_ = clblasRowMajor; buffer_.a_num_vectors_ = M; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { order_ = clblasColumnMajor; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = M; } else if (lda < M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; cl_int err; size_t size = (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T); if( size >= maxMemAllocSize ) throw std::runtime_error( "Tried to create a buffer larger than allowable on this device" ); buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, size, NULL, &err); if (transA_option == 0) { buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer_.n_*sizeof(T), NULL, &err); buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.m_*sizeof(T), NULL, &err); } else { buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer_.m_*sizeof(T), NULL, &err); buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.n_*sizeof(T), NULL, &err); } } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } if (buffer_.trans_a_ == clblasNoTrans) { for (size_t i = 0; i < buffer_.n_; ++i) { buffer_.x_[i] = random(UPPER_BOUND()) / randomScale(); } for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.y_[i] = random(UPPER_BOUND()) / randomScale(); } } else { for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i] = random(UPPER_BOUND()) / randomScale(); } for (size_t i = 0; i < buffer_.n_; ++i) { buffer_.y_[i] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); if (buffer_.trans_a_ == clblasNoTrans) { err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.x_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.y_, 0, NULL, NULL); } else { err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.x_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.y_, 0, NULL, NULL); } } void reset_gpu_write_buffer() { cl_int err; if (buffer_.trans_a_ == clblasNoTrans) { err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.y_, 0, NULL, NULL); } else { err = 
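// Size rule applied by setup_buffer() above, restated for reference:
// for y := alpha*op(A)*x + beta*y with A stored as M rows by N columns,
//   op(A) = A   -> x needs N elements, y needs M elements
//   op(A) = A^T -> x needs M elements, y needs N elements
// e.g. M = 4096, N = 2048 with no transpose: x holds 2048 values and y 4096.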
clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.y_, 0, NULL, NULL); } } void read_gpu_buffer() { //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = makeScalar(alpha); buffer_.beta_ = makeScalar(beta); } private: xGemvBuffer buffer_; }; // class xgemv template<> void xGemv:: call_func() { timer.Start(timer_id); clblasSgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xGemv:: call_func() { timer.Start(timer_id); clblasDgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xGemv:: call_func() { timer.Start(timer_id); clblasCgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xGemv:: call_func() { timer.Start(timer_id); clblasZgemv(order_, buffer_.trans_a_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XGEMV_HXX__ clblas-2.10/src/client/clfunc_xger.hpp000066400000000000000000000244601264277366700200030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XGER_HXX__ #define CLBLAS_BENCHMARK_XGER_HXX__ #include "clfunc_common.hpp" template struct xGerBuffer { clblasOrder order_; size_t m_; size_t n_; T alpha; T* X; cl_mem x_; size_t offX; int incx_; T* Y; cl_mem y_; size_t offY; int incy_; T* A; cl_mem a_; size_t a_num_vectors_; size_t offA; size_t lda_; }; // struct buffer template class xGer : public clblasFunc { public: xGer(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clGer", 0); } ~xGer() { delete buffer_.X; delete buffer_.Y; delete buffer_.A; OPENCL_V_THROW( clReleaseMemObject(buffer_.x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.y_), "releasing buffer Y"); OPENCL_V_THROW( clReleaseMemObject(buffer_.a_), "releasing buffer A"); } //void call_func() {} double gflops() { return (buffer_.m_*(buffer_.m_+1))/time_in_ns(); } std::string gflops_formula() { return "M*(M+1)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.incx_ = 1; buffer_.incy_ = 1; if (order_option == 0) { buffer_.order_ = clblasRowMajor; } else { buffer_.order_ = clblasColumnMajor; } if (lda == 0) { buffer_.lda_ = M; } else { if( lda < M ) { std::cerr << "ERROR: lda must be set to 0 or a value >= M" << std::endl; } else if (lda >= M) { buffer_.lda_ = lda; } } buffer_.offA = offA; buffer_.offX = offB; buffer_.offY = offC; buffer_.a_num_vectors_ = buffer_.n_; size_t sizeA = buffer_.lda_*buffer_.a_num_vectors_; size_t sizeX = buffer_.m_; size_t sizeY = buffer_.n_; buffer_.A = new T[sizeA]; buffer_.X = new T[sizeX]; buffer_.Y = new T[sizeY]; cl_int err; buffer_.a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, sizeA*sizeof(T), NULL, &err); buffer_.x_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, sizeX*sizeof(T), NULL, &err); buffer_.y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, sizeY*sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.X[i] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.n_; ++i) { buffer_.Y[i] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { /*if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j] = static_cast(1.0); } else {*/ buffer_.A[i*buffer_.lda_+j] = static_cast(rand())/static_cast(RAND_MAX); //} } else { buffer_.A[i*buffer_.lda_+j] = static_cast(0.0); } } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.a_, CL_TRUE, 0, buffer_.lda_*buffer_.a_num_vectors_*sizeof(T), buffer_.A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.x_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.y_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.Y, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.x_, CL_TRUE, 0, buffer_.m_, buffer_.x_, 0, NULL, NULL); } void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int 
side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha = alpha; } private: xGerBuffer buffer_; }; // class xger //template<> //void //xGer:: //initialize_scalars(double alpha, double beta) //{ // buffer_.alpha = alpha; //} //template<> //void //xGer:: //initialize_scalars(double alpha, double beta) //{ //} template<> void xGer:: call_func() { timer.Start(timer_id); clblasSger(buffer_.order_, buffer_.m_, buffer_.n_, buffer_.alpha, buffer_.x_, buffer_.offX, 1, buffer_.y_, buffer_.offY, 1, buffer_.a_, buffer_.offA, buffer_.lda_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xGer:: call_func() { timer.Start(timer_id); clblasDger(buffer_.order_, buffer_.m_, buffer_.n_, buffer_.alpha, buffer_.x_, buffer_.offX, 1, buffer_.y_, buffer_.offY, 1, buffer_.a_, buffer_.offA, buffer_.lda_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } //template<> //void //xGer:: //call_func() //{ // timer.Start(timer_id); // clblasCger(order_, buffer_.m_, buffer_.n, buffer_a_, 0, // buffer_.lda_, buffer_x_, 0, 1, numQueues, queues_, 0, NULL, // &event_); // clWaitForEvents(1, &event_); // timer.Stop(timer_id); //} // //template<> //void //xGer:: //call_func() //{ // timer.Start(timer_id); // clblasZger(order_, buffer_.uplo_, buffer_.trans_a_, // buffer_.diag_, buffer_.m_, buffer_a_, 0, // buffer_.lda_, buffer_x_, 0, 1, numQueues, queues_, 0, NULL, // &event_); // clWaitForEvents(1, &event_); // timer.Stop(timer_id); //} //template<> //void //xGer:: //initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer_.m_; ++i) // { // buffer_x_[i].s[0] = // static_cast(rand())/static_cast(RAND_MAX); // buffer_.x_[i].s[1] = // static_cast(rand())/static_cast(RAND_MAX); // } // // for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) // { // for (size_t j = 0; j < buffer_.lda_; ++j) // { // if (i == j) // { // if (buffer_.diag_ == clblasUnit) // { // buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0f; // buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; // } // else // { // buffer_.a_[i*buffer_.lda_+j].s[0] = // static_cast(rand())/static_cast(RAND_MAX); // buffer_.a_[i*buffer_.lda_+j].s[1] = // static_cast(rand())/static_cast(RAND_MAX); // } // } // else // { // buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0f; // buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; // } // } // } // // //} //template<> //void //xGer:: //initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer_.m_; ++i) // { // buffer_.x_[i].s[0] = // static_cast(rand())/static_cast(RAND_MAX); // buffer_.x_[i].s[1] = // static_cast(rand())/static_cast(RAND_MAX); // } // // for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) // { // for (size_t j = 0; j < buffer_.lda_; ++j) // { // if (i == j) // { // if (buffer_.diag_ == clblasUnit) // { // buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0; // buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; // } // else // { // buffer_.a_[i*buffer_.lda_+j].s[0] = // static_cast(rand())/static_cast(RAND_MAX); // buffer_.a_[i*buffer_.lda_+j].s[1] = // 
static_cast(rand())/static_cast(RAND_MAX); // } // } // else // { // buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0; // buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; // } // } // } //} //template<> //double //xGer:: //gflops() //{ // return 2.0*buffer_.m_*(buffer_.m_+1)/time_in_ns(); //} // //template<> //double //xGer:: //gflops() //{ // return 2.0*buffer_.m_*(buffer_.m_+1)/time_in_ns(); //} // //template<> //std::string //xGer:: //gflops_formula() //{ // return "2.0*M*(M+1)/time"; //} // //template<> //std::string //xGer:: //gflops_formula() //{ // return "2.0*M*(M+1)/time"; //} #endif // ifndef CLBLAS_BENCHMARK_XGER_HXX__ clblas-2.10/src/client/clfunc_xgerc.hpp000066400000000000000000000216011264277366700201400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XGERC_HXX__ #define CLBLAS_BENCHMARK_XGERC_HXX__ #include "clfunc_common.hpp" template struct xGercBuffer { clblasOrder order; size_t M; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuY; cl_mem Y; size_t offy; int incy; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xGerc : public clblasFunc { public: xGerc(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clGerc", 0); } ~xGerc() { delete buffer.cpuA; delete buffer.cpuX; delete buffer.cpuY; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer.Y), "releasing buffer Y"); } double gflops() { return (buffer.N*(buffer.N+1))/time_in_ns(); } std::string gflops_formula() { return "M*(M+1)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void call_func(); void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to do } protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); //buffer.beta = makeScalar(beta); } private: xGercBuffer buffer; }; template void xGerc::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.offy = offC; buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.M = M; buffer.N = N; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.M]; buffer.cpuY = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.M*sizeof(T), NULL, &err); buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xGerc::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.M; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuY[i] = random(UPPER_BOUND()) / randomScale(); } } //template <> //void xGerc::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.M; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuY[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} //template <> //void xGerc::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.M; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuY[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} // template void xGerc::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = 
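// Informal note on the two complex rank-1 updates benchmarked by the
// xGerc/xGeru clients (standard BLAS semantics, not library documentation):
//   geru : A := alpha * x * y^T + A   (y unconjugated)
//   gerc : A := alpha * x * y^H + A   (y conjugated)
// with A of size M x N, x of length M and y of length N.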
clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.M*sizeof(T), buffer.cpuX, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.Y, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuY, 0, NULL, NULL); } template void xGerc::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xGerc::call_func() { timer.Start(timer_id); clblasCgerc(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xGerc::call_func() { timer.Start(timer_id); clblasZgerc(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xgeru.hpp000066400000000000000000000155431264277366700201720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XGERU_HXX__ #define CLBLAS_BENCHMARK_XGERU_HXX__ #include "clfunc_common.hpp" template struct xGeruBuffer { clblasOrder order; size_t M; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuY; cl_mem Y; size_t offy; int incy; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xGeru : public clblasFunc { public: xGeru(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clGeru", 0); } ~xGeru() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return (buffer.N*(buffer.N+1))/time_in_ns(); } std::string gflops_formula() { return "M*(M+1)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); //buffer.beta = makeScalar(beta); } private: xGeruBuffer buffer; }; template void xGeru::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.offy = offC; buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.M = M; buffer.N = N; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.M]; buffer.cpuY = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.M*sizeof(T), NULL, &err); buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xGeru::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.M; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuY[i] = random(UPPER_BOUND()) / randomScale(); } } template void xGeru::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.M*sizeof(T), buffer.cpuX, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.Y, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuY, 0, NULL, NULL); } template void xGeru::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xGeru::call_func() { timer.Start(timer_id); clblasCgeru(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xGeru::call_func() { timer.Start(timer_id); clblasZgeru(buffer.order, buffer.M, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xhemm.hpp000066400000000000000000000411661264277366700201560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHEMM_HXX__ #define CLBLAS_BENCHMARK_XHEMM_HXX__ #include "clfunc_common.hpp" //clblasChemm( // clblasOrder order, // clblasSide side, // clblasUplo uplo, // size_t M, // size_t N, // cl_float2 alpha, // const cl_mem A, // size_t offa, // size_t lda, // const cl_mem B, // size_t offb, // size_t ldb, // cl_float2 beta, // cl_mem C, // size_t offc, // size_t ldc, // cl_uint numCommandQueues, // cl_command_queue *commandQueues, // cl_uint numEventsInWaitList, // const cl_event *eventWaitList); template struct xHemmBuffer { clblasOrder order; clblasSide side; clblasUplo uplo; size_t M; size_t N; T alpha; T* cpuA; size_t a_num_vectors; cl_mem A; size_t offa; size_t lda; T* cpuB; cl_mem B; size_t offb; size_t ldb; T beta; T* cpuC; cl_mem C; size_t offc; size_t ldc; }; // struct buffer template class xHemm : public clblasFunc { public: xHemm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHemm", 0); } ~xHemm() { } double gflops() { if (buffer.side == clblasLeft) { return (8*buffer.M*buffer.M*buffer.N)/time_in_ns(); } else { return (8*buffer.N*buffer.N*buffer.M)/time_in_ns(); } } std::string gflops_formula() { if (buffer.side == clblasLeft) { return "8*M*M*N/time"; } else { return "8*N*N*M/time"; } } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(){} void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(T), buffer.ldc*buffer.N*sizeof(T), buffer.cpuC,0,NULL,NULL); } void roundtrip_func() { std::cout << "xHemm::roundtrip_func" <(alpha); buffer.beta = makeScalar(beta); } private: xHemmBuffer buffer; }; template void xHemm::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offb = offB; buffer.offc = offC; buffer.M = M; buffer.N = N; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (side_option == 0) { buffer.side = clblasLeft; buffer.a_num_vectors = M; if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } } else { buffer.side = clblasRight; buffer.a_num_vectors = N; if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } } /*} if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << 
"lda:wrong size\n"; exit(1); } else { buffer.lda = lda; }*/ if (ldb == 0) { buffer.ldb = buffer.M; } else if (ldb < buffer.M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer.ldb = ldb; } if (ldc == 0) { buffer.ldc = buffer.M; } else if (ldc < buffer.M) { std::cerr << "ldc:wrong size\n"; exit(1); } else { buffer.ldc = ldc; } buffer.cpuB = new T[buffer.N * buffer.ldb]; buffer.cpuC = new T[buffer.N * buffer.ldc]; buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(T), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(T), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(T), NULL, &err); } template <> void xHemm::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.a_num_vectors; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldb; ++j) { buffer.cpuB[i*buffer.ldb+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuB[i*buffer.ldb+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldc; ++j) { buffer.cpuC[i*buffer.ldc+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuC[i*buffer.ldc+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } //for (size_t i = 0; i < buffer.N; ++i) //{ // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); //} } template <> void xHemm::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.a_num_vectors; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldb; ++j) { buffer.cpuB[i*buffer.ldb+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuB[i*buffer.ldb+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldc; ++j) { buffer.cpuC[i*buffer.ldc+j].s[0] = static_cast(rand())/ static_cast(RAND_MAX); buffer.cpuC[i*buffer.ldc+j].s[1] = static_cast(rand())/ static_cast(RAND_MAX); } } } template void xHemm::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.a_num_vectors * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, buffer.offb * sizeof(T), buffer.ldb*buffer.N*sizeof(T), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(T), buffer.ldc*buffer.N*sizeof(T), buffer.cpuC, 0, NULL, NULL); } template void xHemm::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(T), buffer.cpuC, 0, NULL, NULL); } template <> void xHemm::call_func() { timer.Start(timer_id); clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, 
buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHemm::roundtrip_func() { timer.Start(timer_id); cl_int err; //create buffer buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_float2), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_float2), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_float2), NULL, &err); //write gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_float2), buffer.a_num_vectors * buffer.lda*sizeof(cl_float2), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, buffer.offb * sizeof(cl_float2), buffer.ldb*buffer.N*sizeof(cl_float2), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_float2), buffer.ldc*buffer.N*sizeof(cl_float2), buffer.cpuC, 0, NULL, NULL); clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_float2), buffer.ldc*buffer.N*sizeof(cl_float2), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHemm::call_func() { timer.Start(timer_id); clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHemm::roundtrip_func() { timer.Start(timer_id); cl_int err; //create buffer buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_double2), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_double2), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_double2), NULL, &err); //write gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_double2), buffer.a_num_vectors * buffer.lda*sizeof(cl_double2), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, buffer.offb * sizeof(cl_double2), buffer.ldb*buffer.N*sizeof(cl_double2), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_double2), buffer.ldc*buffer.N*sizeof(cl_double2), buffer.cpuC, 0, NULL, NULL); clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_double2), buffer.ldc*buffer.N*sizeof(cl_double2), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef 
CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xhemv.hpp000066400000000000000000000167031264277366700201660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHEMV_HXX__ #define CLBLAS_BENCHMARK_XHEMV_HXX__ #include "clfunc_common.hpp" template struct xHemvBuffer { clblasOrder order; clblasUplo uplo; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T beta; T* cpuY; cl_mem Y; size_t offy; int incy; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xHemv : public clblasFunc { public: xHemv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHemv", 0); } ~xHemv() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return static_cast((2 * buffer.N * buffer.N)/time_in_ns()); } std::string gflops_formula() { return "2*N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); buffer.beta = makeScalar(beta); } private: xHemvBuffer buffer; }; template void xHemv::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.offy = offC; buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.N = M; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.N]; buffer.cpuY = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xHemv::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); buffer.cpuY[i] = random(UPPER_BOUND()) / randomScale(); } } template void xHemv::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuX, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.Y, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuY, 0, NULL, NULL); } template void xHemv::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xHemv::call_func() { timer.Start(timer_id); clblasChemv(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.X, buffer.offx, buffer.incx, buffer.beta, buffer.Y, buffer.offy, buffer.incy, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHemv::call_func() { timer.Start(timer_id); clblasZhemv(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.X, buffer.offx, buffer.incx, buffer.beta, buffer.Y, buffer.offy, buffer.incy, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xHemv:: gflops() { return static_cast((8 * buffer.N * buffer.N)/time_in_ns()); } template<> double xHemv:: gflops() { return static_cast((8 * buffer.N * buffer.N)/time_in_ns()); } template<> std::string xHemv:: gflops_formula() { return "8*N*N/time"; } template<> std::string xHemv:: gflops_formula() { return "8*N*N/time"; } #endif // ifndef 
CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xher.hpp000066400000000000000000000200701264277366700177750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHER_HXX__ #define CLBLAS_BENCHMARK_XHER_HXX__ #include "clfunc_common.hpp" template struct xHerBuffer { clblasOrder order; clblasUplo uplo; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xHer : public clblasFunc { public: xHer(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHer", 0); } ~xHer() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return static_cast((buffer.N * buffer.N)/time_in_ns()); } std::string gflops_formula() { return "N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); //buffer.beta = makeScalar(beta); } private: xHerBuffer buffer; }; template void xHer::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.N = M; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xHer::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); } } // //template <> //void xHer::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} //template <> //void xHer::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} template void xHer::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuX, 0, NULL, NULL); } template void xHer::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xHer::call_func() { timer.Start(timer_id); clblasCher(buffer.order, buffer.uplo, buffer.N, buffer.alpha.s[0], buffer.X, buffer.offx, buffer.incx, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHer::call_func() { 
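// Note (added comment): like clblasCher above, clblasZher takes a real (double) alpha,
// so only the real component buffer.alpha.s[0] of the stored complex scalar is forwarded.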
timer.Start(timer_id); clblasZher(buffer.order, buffer.uplo, buffer.N, buffer.alpha.s[0], buffer.X, buffer.offx, buffer.incx, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xHer:: gflops() { return static_cast((4 * buffer.N * buffer.N)/time_in_ns()); } template<> double xHer:: gflops() { return static_cast((4 * buffer.N * buffer.N)/time_in_ns()); } template<> std::string xHer:: gflops_formula() { return "4*N*N/time"; } template<> std::string xHer:: gflops_formula() { return "4*N*N/time"; } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xher2.hpp000066400000000000000000000223701264277366700200640ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHER2_HXX__ #define CLBLAS_BENCHMARK_XHER2_HXX__ #include "clfunc_common.hpp" template struct xHer2Buffer { clblasOrder order; clblasUplo uplo; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuY; cl_mem Y; size_t offy; int incy; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xHer2 : public clblasFunc { public: xHer2(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHer2", 0); } ~xHer2() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return static_cast((2 * buffer.N * buffer.N)/time_in_ns()); } std::string gflops_formula() { return "2*N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); //buffer.beta = makeScalar(beta); } private: xHer2Buffer buffer; }; template void xHer2::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.offy = offC; buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.N = M; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.N]; buffer.cpuY = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xHer2::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); buffer.cpuY[i] = random(UPPER_BOUND()) / randomScale(); } } //template <> //void xHer2::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} //template <> //void xHer2::initialize_cpu_buffer() //{ // srand(10); // for (size_t i = 0; i < buffer.N; ++i) // { // for (size_t j = 0; j < buffer.lda; ++j) // { // buffer.cpuA[i*buffer.lda+j].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuA[i*buffer.lda+j].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } // } // // for (size_t i = 0; i < buffer.N; ++i) // { // buffer.cpuX[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuX[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[0] = static_cast(rand())/ // static_cast(RAND_MAX); // buffer.cpuY[i].s[1] = static_cast(rand())/ // static_cast(RAND_MAX); // } //} // template void xHer2::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, 
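// Note (added comment): X and Y are uploaded starting at element 0; only the A upload
// above applies the offa element offset.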
buffer.N*sizeof(T), buffer.cpuX, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.Y, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuY, 0, NULL, NULL); } template void xHer2::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xHer2::call_func() { timer.Start(timer_id); clblasCher2(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xHer2::call_func() { timer.Start(timer_id); clblasZher2(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xHer2:: gflops() { return static_cast((8 * buffer.N * buffer.N)/time_in_ns()); } template<> double xHer2:: gflops() { return static_cast((8 * buffer.N * buffer.N)/time_in_ns()); } template<> std::string xHer2:: gflops_formula() { return "8*N*N/time"; } template<> std::string xHer2:: gflops_formula() { return "8*N*N/time"; } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xher2k.hpp000066400000000000000000000474001264277366700202400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHER2K_HXX__ #define CLBLAS_BENCHMARK_XHER2K_HXX__ #include "clfunc_common.hpp" template struct xHer2kBuffer { clblasOrder order_; clblasUplo uplo_; clblasTranspose transA_; size_t N_; size_t K_; T alpha_; cl_mem A_; size_t offa_; size_t lda_; cl_mem B_; size_t offb_; size_t ldb_; T beta_; cl_mem C_; size_t offc_; size_t ldc_; size_t a_num_vectors_; size_t b_num_vectors_; size_t c_num_vectors_; T* cpuA_; T* cpuB_; T* cpuC_; }; // struct buffer template class xHer2k : public clblasFunc { public: xHer2k(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHer2k", 0); } ~xHer2k() { } double gflops() { return static_cast(8*(buffer_.K_ * buffer_.N_ * buffer_.N_)/time_in_ns()+2*buffer_.N_/time_in_ns()); } std::string gflops_formula() { return "(8*K*N*N+2*N)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); initialize_scalars(alpha,beta); buffer_.N_ = N; buffer_.K_ = K; buffer_.offa_ = offA; buffer_.offb_ = offB; buffer_.offc_ = offC; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.transA_ = clblasNoTrans; buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; buffer_.transA_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; cl_int err; 
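// Note (added comment): each device buffer below is sized as
// (leading dimension * vector count + offset) elements, so the offa_/offb_/offc_
// offsets passed to clblasXher2k stay inside the allocation.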
buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(T), NULL, &err); buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offb_) * sizeof(T), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.cpuA_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.N_; ++i) { for (size_t j = 0; j < buffer_.ldc_; ++j) { buffer_.cpuC_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.A_, CL_TRUE, buffer_.offa_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.cpuA_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offa_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void call_func(); void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void roundtrip_func(); void zerocopy_roundtrip_func() { std::cout << "xTrmm::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); initialize_scalars(alpha,beta); buffer_.N_ = N; buffer_.K_ = K; buffer_.offa_ = offA; buffer_.offb_ = offBX; buffer_.offc_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.transA_ = clblasNoTrans; buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; buffer_.transA_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { 
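// Note (added comment): lda has already passed the >= N check for this layout,
// so the caller-supplied value is kept as-is.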
buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.cpuA_; delete buffer_.cpuB_; delete buffer_.cpuC_; OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.B_), "releasing buffer B"); OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C"); } protected: protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = makeScalar(alpha); buffer_.beta_ = makeScalar(beta); } private: xHer2kBuffer buffer_; }; template<> void xHer2k::call_func() { timer.Start(timer_id); clblasCher2k(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_, buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.B_, buffer_.offb_, buffer_.ldb_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHer2k::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(cl_float2), NULL, &err); buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offb_) * sizeof(cl_float2), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(cl_float2), NULL, &err); this->initialize_gpu_buffer(); clblasCher2k(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_, buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.B_, buffer_.offb_, buffer_.ldb_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_float2), buffer_.cpuC_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHer2k::call_func() { timer.Start(timer_id); clblasZher2k(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_, buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.B_, buffer_.offb_, buffer_.ldb_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHer2k::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(cl_double2), NULL, &err); buffer_.B_ = clCreateBuffer(ctx_, 
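// Note (added comment): the shared initialize_gpu_buffer() used in this roundtrip path
// uploads A and C only (B is never written), and the final transfer at the end of this
// function is another write of C rather than a read-back of the result.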
CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offb_) * sizeof(cl_double2), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(cl_double2), NULL, &err); this->initialize_gpu_buffer(); clblasZher2k(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_, buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.B_, buffer_.offb_, buffer_.ldb_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_double2), buffer_.cpuC_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xherk.hpp000066400000000000000000000367111264277366700201610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XHERK_HXX__ #define CLBLAS_BENCHMARK_XHERK_HXX__ #include "clfunc_common.hpp" template struct xHerkBuffer { clblasOrder order_; clblasUplo uplo_; clblasTranspose transA_; size_t N_; size_t K_; T alpha_; cl_mem A_; size_t offa_; size_t lda_; T beta_; cl_mem C_; size_t offc_; size_t ldc_; size_t a_num_vectors_; size_t c_num_vectors_; T* cpuA_; T* cpuC_; }; // struct buffer template class xHerk : public clblasFunc { public: xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clHerk", 0); } ~xHerk() { } double gflops() { return static_cast(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns()); } std::string gflops_formula() { return "4*K*N*(N+1)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); DUMMY_ARGS_USAGE_2(ldb, offB); initialize_scalars(alpha,beta); buffer_.N_ = N; buffer_.K_ = K; buffer_.offa_ = offA; buffer_.offc_ = offC; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.transA_ = clblasNoTrans; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { 
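// Note (added comment): transA_option 2 selects the conjugate-transpose form
// (clblasConjTrans) of A for the rank-k update.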
buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.transA_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = N; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; cl_int err; buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(T), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.cpuA_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.N_; ++i) { for (size_t j = 0; j < buffer_.ldc_; ++j) { buffer_.cpuC_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.A_, CL_TRUE, buffer_.offa_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.cpuA_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offa_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void call_func(); void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T), buffer_.cpuC_, 0, NULL, NULL); } void roundtrip_func(); void zerocopy_roundtrip_func() { std::cout << "xTrmm::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); DUMMY_ARGS_USAGE_2(ldb, offBX); initialize_scalars(alpha,beta); buffer_.N_ = N; buffer_.K_ = K; buffer_.offa_ = offA; buffer_.offc_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.transA_ = clblasNoTrans; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = 
clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.transA_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = N; if (transA_option == 1) { buffer_.transA_ = clblasTrans; } else if (transA_option == 2) { buffer_.transA_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.cpuA_; delete buffer_.cpuC_; OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C"); } protected: protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = makeScalar(alpha); buffer_.beta_ = makeScalar(beta); } private: xHerkBuffer buffer_; }; template<> void xHerk::call_func() { timer.Start(timer_id); clblasCherk(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHerk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(cl_float2), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(cl_float2), NULL, &err); this->initialize_gpu_buffer(); clblasCherk(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_float2), buffer_.cpuC_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHerk::call_func() { timer.Start(timer_id); clblasZherk(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xHerk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offa_) * sizeof(cl_double2), NULL, &err); buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offc_) * sizeof(cl_double2), NULL, &err); this->initialize_gpu_buffer(); clblasZherk(order_, buffer_.uplo_, buffer_.transA_, buffer_.N_, buffer_.K_, buffer_.alpha_.s[0], buffer_.A_, buffer_.offa_, buffer_.lda_, buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, 
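// Note (added comment): HERK takes real-valued alpha and beta, hence only the .s[0]
// components of the stored complex scalars are passed to clblasCherk/clblasZherk.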
buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_double2), buffer_.cpuC_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xsymm.hpp000066400000000000000000000466371264277366700202250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYMM_HXX__ #define CLBLAS_BENCHMARK_XSYMM_HXX__ #include "clfunc_common.hpp" template struct xSymmBuffer { clblasOrder order; clblasSide side; clblasUplo uplo; size_t M; size_t N; T alpha; T* cpuA; size_t a_num_vectors; cl_mem A; size_t offa; size_t lda; T* cpuB; cl_mem B; size_t offb; size_t ldb; T beta; T* cpuC; cl_mem C; size_t offc; size_t ldc; }; // struct buffer template class xSymm : public clblasFunc { public: xSymm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clSymm", 0); } ~xSymm() { } double gflops() { if (buffer.side == clblasLeft) return static_cast((2 * buffer.M * buffer.M * buffer.N)/time_in_ns()); else return static_cast((2 * buffer.N * buffer.N * buffer.M)/time_in_ns()); } std::string gflops_formula() { if (buffer.side == clblasLeft) return "2*M*M*N/time"; else return "2*N*N*M/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(T), buffer.ldc * buffer.N * sizeof(T), buffer.cpuC, 0, NULL, NULL); } void roundtrip_func() { std::cout << "xSymm::roundtrip_func\n"; } void zerocopy_roundtrip_func() { std::cout << "xSymm::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offb = offB; buffer.offc = offC; buffer.M = M; buffer.N = N; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (side_option == 0) { buffer.side = clblasLeft; buffer.a_num_vectors = M; if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; 
exit(1); } else { buffer.lda = lda; } } else { buffer.side = clblasRight; buffer.a_num_vectors = N; if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } } /*} if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; }*/ if (ldb == 0) { buffer.ldb = buffer.M; } else if (ldb < buffer.M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer.ldb = ldb; } if (ldc == 0) { buffer.ldc = buffer.M; } else if (ldc < buffer.M) { std::cerr << "ldc:wrong size\n"; exit(1); } else { buffer.ldc = ldc; } buffer.cpuB = new T[buffer.N * buffer.ldb]; buffer.cpuC = new T[buffer.N * buffer.ldc]; buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer.cpuA; delete buffer.cpuB; delete buffer.cpuC; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B"); OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C"); } protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = makeScalar(alpha); buffer.beta = makeScalar(beta); } private: xSymmBuffer buffer; }; template void xSymm::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offb = offB; buffer.offc = offC; buffer.M = M; buffer.N = N; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (side_option == 0) { buffer.side = clblasLeft; buffer.a_num_vectors = M; if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } } else { buffer.side = clblasRight; buffer.a_num_vectors = N; if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } } /*} if (lda == 0) { buffer.lda = buffer.M; } else if (lda < buffer.M) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; }*/ if (ldb == 0) { buffer.ldb = buffer.M; } else if (ldb < buffer.M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer.ldb = ldb; } if (ldc == 0) { buffer.ldc = buffer.M; } else if (ldc < buffer.M) { std::cerr << "ldc:wrong size\n"; exit(1); } else { buffer.ldc = ldc; } buffer.cpuB = new T[buffer.N * buffer.ldb]; buffer.cpuC = new T[buffer.N * buffer.ldc]; buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(T), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(T), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(T), NULL, &err); } template void xSymm::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.a_num_vectors; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); 
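// Note (added comment): srand(10) above fixes the RNG seed, so every benchmark run
// fills A, B and C with the same pseudo-random values.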
} } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldb; ++j) { buffer.cpuB[i*buffer.ldb+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.ldc; ++j) { buffer.cpuC[i*buffer.ldc+j] = random(UPPER_BOUND()) / randomScale(); } } } template void xSymm::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.a_num_vectors * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0, buffer.ldb*buffer.N*sizeof(T), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(T), buffer.cpuC, 0, NULL, NULL); } template void xSymm::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(T), buffer.cpuC, 0, NULL, NULL); } template <> void xSymm::call_func() { timer.Start(timer_id); clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_float), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_float), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_float), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_float), buffer.a_num_vectors * buffer.lda*sizeof(cl_float), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0, buffer.ldb*buffer.N*sizeof(cl_float), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(cl_float), buffer.cpuC, 0, NULL, NULL); //call func clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_float), buffer.ldc * buffer.N * sizeof(cl_float), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::call_func() { timer.Start(timer_id); clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_double), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_double), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_double), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_double), 
buffer.a_num_vectors * buffer.lda*sizeof(cl_double), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0, buffer.ldb*buffer.N*sizeof(cl_double), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(cl_double), buffer.cpuC, 0, NULL, NULL); //call func clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_double), buffer.ldc * buffer.N * sizeof(cl_double), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::call_func() { timer.Start(timer_id); clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_float2), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_float2), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_float2), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_float2), buffer.a_num_vectors * buffer.lda*sizeof(cl_float2), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0, buffer.ldb*buffer.N*sizeof(cl_float2), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(cl_float2), buffer.cpuC, 0, NULL, NULL); //call func clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_float2), buffer.ldc * buffer.N * sizeof(cl_float2), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::call_func() { timer.Start(timer_id); clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSymm::roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.a_num_vectors * buffer.lda*sizeof(cl_double2), NULL, &err); buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N*buffer.ldb*sizeof(cl_double2), NULL, &err); buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*buffer.ldc*sizeof(cl_double2), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(cl_double2), buffer.a_num_vectors * buffer.lda*sizeof(cl_double2), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], 
buffer.B, CL_TRUE, 0, buffer.ldb*buffer.N*sizeof(cl_double2), buffer.cpuB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0, buffer.ldc*buffer.N*sizeof(cl_double2), buffer.cpuC, 0, NULL, NULL); //call func clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N, buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb, buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_, 0, NULL,NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE, buffer.offc * sizeof(cl_double2), buffer.ldc * buffer.N * sizeof(cl_double2), buffer.cpuC, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xSymm:: gflops() { if (buffer.side == clblasLeft) return static_cast((8 * buffer.M * buffer.M * buffer.N)/time_in_ns()); else return static_cast((8 * buffer.N * buffer.N * buffer.M)/time_in_ns()); } template<> double xSymm:: gflops() { if (buffer.side == clblasLeft) return static_cast((8 * buffer.M * buffer.M * buffer.N)/time_in_ns()); else return static_cast((8 * buffer.N * buffer.N * buffer.M)/time_in_ns()); } template<> std::string xSymm:: gflops_formula() { if (buffer.side == clblasLeft) return "8*M*M*N/time"; else return "8*N*N*M/time"; } template<> std::string xSymm:: gflops_formula() { if (buffer.side == clblasLeft) return "8*M*M*N/time"; else return "8*N*N*M/time"; } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xsymv.hpp000066400000000000000000000166171264277366700202310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
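The complex-typed gflops() specializations just above count eight real operations per complex multiply-add, giving 8*M*M*N for a left-side SYMM and 8*N*N*M on the right; the real-typed variants presumably use the corresponding 2*M*M*N count. Since time_in_ns() reports nanoseconds, flops divided by nanoseconds is already GFLOPS. A small self-contained illustration of that arithmetic (symm_gflops is an illustrative name only):

#include <cstddef>
#include <iostream>

// Illustrative flop counting for SYMM, not part of clBLAS: 2 flops per real
// multiply-add, 8 per complex multiply-add, divided by a time in nanoseconds
// to give GFLOPS (the same convention as the gflops() members above).
static double symm_gflops(bool left_side, bool complex_type,
                          size_t M, size_t N, double time_ns)
{
    const double per_mac = complex_type ? 8.0 : 2.0;
    const double macs    = left_side ? double(M) * M * N   // A is M x M
                                     : double(N) * N * M;  // A is N x N
    return per_mac * macs / time_ns;
}

int main()
{
    // e.g. a 1024 x 1024 left-side complex SYMM that took 4 ms (4.0e6 ns):
    std::cout << symm_gflops(true, true, 1024, 1024, 4.0e6) << " GFLOPS\n";
    return 0;
}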
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYMV_HXX__ #define CLBLAS_BENCHMARK_XSYMV_HXX__ #include "clfunc_common.hpp" template struct xSymvBuffer { clblasOrder order_; size_t n_; size_t lda_; size_t offA_; size_t a_num_vectors_; clblasUplo uplo_; T* a_; T* x_; T* y_; cl_mem buf_a_; cl_mem buf_x_; cl_mem buf_y_; T alpha_; T beta_; }; // struct buffer template class xSymv : public clblasFunc { public: xSymv(StatisticalTimer& _timer, cl_device_type devType) : clblasFunc(_timer, devType) { timer.getUniqueID("clSymv", 0); } ~xSymv() { delete buffer_.a_; delete buffer_.x_; delete buffer_.y_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_y_), "releasing buffer Y"); } void call_func() { } double gflops() { return (2.0*buffer_.n_*buffer_.n_)/time_in_ns(); } std::string gflops_formula() { return "2.0*N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_3(side_option, diag_option, transB_option); DUMMY_ARGS_USAGE_4(M, K, ldb, ldc); DUMMY_ARGS_USAGE_3(transA_option, offBX, offCY); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.a_num_vectors_ = N; buffer_.offA_ = offA; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } buffer_.x_ = new T[buffer_.n_]; buffer_.y_ = new T[buffer_.n_]; if (order_option == 0) { order_ = clblasRowMajor; } else { order_ = clblasColumnMajor; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer_.n_*sizeof(T), NULL, &err); buffer_.buf_y_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.n_*sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.a_[i*buffer_.lda_+j] = static_cast(rand()) / static_cast(RAND_MAX); } } for (size_t i = 0; i < buffer_.n_; ++i) { buffer_.x_[i] = static_cast(rand()) / static_cast(RAND_MAX); buffer_.y_[i] = static_cast(rand()) / static_cast(RAND_MAX); } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.x_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.y_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_y_, CL_TRUE, 0, buffer_.n_*sizeof(T), buffer_.y_, 0, NULL, NULL); } void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int 
transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = static_cast(alpha); buffer_.beta_ = static_cast(beta); } private: xSymvBuffer buffer_; }; // class xsymv template<> void xSymv:: call_func() { timer.Start(timer_id); clblasSsymv(order_, buffer_.uplo_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSymv:: call_func() { timer.Start(timer_id); clblasDsymv(order_, buffer_.uplo_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.beta_, buffer_.buf_y_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYMV_HXX__ clblas-2.10/src/client/clfunc_xsyr.hpp000066400000000000000000000141751264277366700200450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
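xSymv only uploads A, x and y and times the clblasSsymv/clblasDsymv call; it never checks the result. If a host-side sanity check were wanted, a straightforward reference SYMV could be compared against the data read back from buf_y_. Below is a sketch under the assumption of row-major storage with the upper triangle populated (reference_ssymv_upper_rowmajor is illustrative, not part of the client):

#include <cstddef>
#include <vector>

// Illustrative reference SYMV (y = alpha*A*x + beta*y, A symmetric, row-major,
// upper triangle stored). Not part of the clBLAS client sources.
static void reference_ssymv_upper_rowmajor(size_t n, float alpha,
                                           const std::vector<float>& A, size_t lda,
                                           const std::vector<float>& x,
                                           float beta, std::vector<float>& y)
{
    for (size_t i = 0; i < n; ++i)
    {
        float acc = 0.0f;
        for (size_t j = 0; j < n; ++j)
        {
            // Only the upper triangle is stored; mirror it for j < i.
            const float aij = (j >= i) ? A[i * lda + j] : A[j * lda + i];
            acc += aij * x[j];
        }
        y[i] = alpha * acc + beta * y[i];
    }
}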
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYR_HXX__ #define CLBLAS_BENCHMARK_XSYR_HXX__ #include "clfunc_common.hpp" template struct xSyrBuffer { clblasOrder order; clblasUplo uplo; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xSyr : public clblasFunc { public: xSyr(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clSyr", 0); } ~xSyr() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return static_cast((buffer.N * buffer.N)/time_in_ns()); } std::string gflops_formula() { return "N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = alpha; } private: xSyrBuffer buffer; }; template void xSyr::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1; buffer.N = M; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xSyr::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); } } template void xSyr::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuX, 0, NULL, 
NULL); } template void xSyr::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xSyr::call_func() { timer.Start(timer_id); clblasSsyr(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSyr::call_func() { timer.Start(timer_id); clblasSsyr(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xsyr2.hpp000066400000000000000000000155661264277366700201340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYR2_HXX__ #define CLBLAS_BENCHMARK_XSYR2_HXX__ #include "clfunc_common.hpp" template struct xSyr2Buffer { clblasOrder order; clblasUplo uplo; size_t N; T alpha; T* cpuX; cl_mem X; size_t offx; int incx; T* cpuY; cl_mem Y; size_t offy; int incy; T* cpuA; cl_mem A; size_t offa; size_t lda; }; // struct buffer template class xSyr2 : public clblasFunc { public: xSyr2(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clSyr2", 0); } ~xSyr2() { delete buffer.cpuA; delete buffer.cpuX; OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer.X), "releasing buffer C"); } double gflops() { return static_cast((2 * buffer.N * buffer.N)/time_in_ns()); } std::string gflops_formula() { return "2*N*N/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta); void initialize_cpu_buffer(); void initialize_gpu_buffer(); void reset_gpu_write_buffer(); void call_func(); void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: protected: void initialize_scalars(double alpha, double beta) { buffer.alpha = alpha; } private: xSyr2Buffer buffer; }; template void xSyr2::setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer.offa = offA; buffer.offx = offB; buffer.incx = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.offy = offC; buffer.incy = 1;//If this changes, remember to adjust size of Y in rest of the file buffer.N = M; if (order_option == 0) { buffer.order = clblasRowMajor; } else { buffer.order = clblasColumnMajor; } if (uplo_option == 0) { buffer.uplo = clblasUpper; } else { buffer.uplo = clblasLower; } if (lda == 0) { buffer.lda = buffer.N; } else if (lda < buffer.N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer.lda = lda; } buffer.cpuX = new T[buffer.N]; buffer.cpuY = new T[buffer.N]; buffer.cpuA = new T[buffer.N * buffer.lda]; cl_int err; buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer.N * buffer.lda*sizeof(T), NULL, &err); buffer.X = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); buffer.Y = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer.N*sizeof(T), NULL, &err); } template void xSyr2::initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer.N; ++i) { for (size_t j = 0; j < buffer.lda; ++j) { buffer.cpuA[i*buffer.lda+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer.N; ++i) { buffer.cpuX[i] = random(UPPER_BOUND()) / randomScale(); buffer.cpuY[i] = random(UPPER_BOUND()) / randomScale(); } } template void xSyr2::initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.X, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuX, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer.Y, CL_TRUE, 0, buffer.N*sizeof(T), buffer.cpuY, 0, NULL, NULL); } template void xSyr2::reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE, buffer.offa * sizeof(T), buffer.N * buffer.lda*sizeof(T), buffer.cpuA, 0, NULL, NULL);; } template <> void xSyr2::call_func() { timer.Start(timer_id); clblasSsyr2(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template <> void xSyr2::call_func() { timer.Start(timer_id); clblasSsyr2(buffer.order, buffer.uplo, buffer.N, buffer.alpha, buffer.X, buffer.offx, buffer.incx, buffer.Y, buffer.offy, buffer.incy, buffer.A, buffer.offa, buffer.lda, numQueues, queues_, 0, NULL,&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__clblas-2.10/src/client/clfunc_xsyr2k.hpp000066400000000000000000000630511264277366700202770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
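Throughout these clients the cl_int err returned by clCreateBuffer and clEnqueueWriteBuffer is assigned but never inspected; only the clReleaseMemObject calls go through OPENCL_V_THROW. A tiny checking helper of the following shape (check_cl is an illustrative name, not part of clBLAS) would make setup failures visible instead of silently benchmarking uninitialized buffers:

#include <CL/cl.h>
#include <cstdlib>
#include <iostream>

// Illustrative OpenCL error-check helper, not part of the clBLAS client.
static void check_cl(cl_int err, const char* what)
{
    if (err != CL_SUCCESS)
    {
        std::cerr << what << " failed with OpenCL error " << err << "\n";
        std::exit(1);
    }
}

// Usage, mirroring setup_buffer() (ctx/queue/bytes/hostA are placeholders):
//   cl_int err;
//   cl_mem A = clCreateBuffer(ctx, CL_MEM_READ_ONLY, bytes, NULL, &err);
//   check_cl(err, "clCreateBuffer(A)");
//   check_cl(clEnqueueWriteBuffer(queue, A, CL_TRUE, 0, bytes, hostA,
//                                 0, NULL, NULL), "clEnqueueWriteBuffer(A)");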
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__ #define CLBLAS_BENCHMARK_XSYR2K_HXX__ #include "clfunc_common.hpp" template struct xSyr2kBuffer { clblasOrder order_; size_t n_; size_t k_; size_t lda_; size_t ldb_; size_t ldc_; size_t offA_; size_t offB_; size_t offC_; size_t a_num_vectors_; size_t b_num_vectors_; size_t c_num_vectors_; clblasTranspose trans_; clblasUplo uplo_; T* a_; T* b_; T* c_; cl_mem buf_a_; cl_mem buf_b_; cl_mem buf_c_; T alpha_; T beta_; }; // struct buffer template class xSyr2k : public clblasFunc { public: xSyr2k(StatisticalTimer& _timer, cl_device_type devType) : clblasFunc(_timer, devType) { timer.getUniqueID("clSyr2k", 0); } ~xSyr2k() { } void call_func() { } double gflops() { return (2*buffer_.k_*buffer_.n_*buffer_.n_+buffer_.n_)/time_in_ns(); } std::string gflops_formula() { return "(2*K*N*N+N)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offB_ = offBX; buffer_.offC_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_ = clblasNoTrans; buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; buffer_.trans_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (transA_option == 1) { buffer_.trans_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { 
std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.b_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldb_; ++j) { buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.c_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldc_; ++j) { buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, 0, buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void roundtrip_func() { } void zerocopy_roundtrip_func() { std::cout << "xTrmm::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offB_ = offBX; buffer_.offC_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_ = clblasNoTrans; buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = 
ldb; } } else { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.b_num_vectors_ = K; buffer_.trans_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = N; } else if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } else { buffer_.a_num_vectors_ = N; buffer_.b_num_vectors_ = N; if (transA_option == 1) { buffer_.trans_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } if (ldb == 0) { buffer_.ldb_ = K; } else if (ldb < K) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.b_; delete buffer_.c_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_), "releasing buffer B"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_), "releasing buffer C"); } protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = makeScalar(alpha); buffer_.beta_ = makeScalar(beta); } private: xSyr2kBuffer buffer_; }; // class xsyr2k template<> void xSyr2k:: call_func() { timer.Start(timer_id); clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyr2k:: roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(float), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(float), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(float), NULL, &err); this->initialize_gpu_buffer(); clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(float), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(float), buffer_.c_, 0, NULL, 
&event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyr2k:: call_func() { timer.Start(timer_id); clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyr2k:: roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(double), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(double), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(double), NULL, &err); this->initialize_gpu_buffer(); clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(double), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(double), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyr2k:: call_func() { timer.Start(timer_id); clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyr2k:: roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float2), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float2), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(cl_float2), NULL, &err); this->initialize_gpu_buffer(); clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_float2), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xSyr2k::gflops() { return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns(); } template<> std::string xSyr2k::gflops_formula() { return "(8*K*N*N+2*N)/time"; } template<> void xSyr2k:: call_func() { timer.Start(timer_id); clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); 
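Every call_func() above measures wall-clock time around a blocking clWaitForEvents on the event returned by clBLAS. An alternative, shown here purely as an illustration (not how this client works), is to read device-side timestamps from that same event; this requires the command queue to have been created with CL_QUEUE_PROFILING_ENABLE:

#include <CL/cl.h>

// Illustrative sketch: device-side duration of the command that produced `ev`,
// in nanoseconds. Only valid if the queue has CL_QUEUE_PROFILING_ENABLE set.
static cl_ulong event_duration_ns(cl_event ev)
{
    cl_ulong start = 0, end = 0;
    clWaitForEvents(1, &ev);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                            sizeof(start), &start, NULL);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                            sizeof(end), &end, NULL);
    return end - start;   // nanoseconds spent executing on the device
}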
timer.Stop(timer_id); } template<> void xSyr2k:: roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double2), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double2), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(cl_double2), NULL, &err); this->initialize_gpu_buffer(); clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(cl_double2), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xSyr2k::gflops() { return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns(); } template<> std::string xSyr2k::gflops_formula() { return "(8*K*N*N+2*N)/time"; } #endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__ clblas-2.10/src/client/clfunc_xsyrk.hpp000066400000000000000000000466001264277366700202160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
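The uploads in initialize_gpu_buffer() and in the roundtrip_func() variants all use blocking writes (CL_TRUE), so each clEnqueueWriteBuffer returns only after the copy has finished. Purely as an illustration of the alternative, and not something this client does, a non-blocking upload enqueues the copy and synchronizes once afterwards:

#include <CL/cl.h>

// Illustrative sketch, not part of the clBLAS client: enqueue the copy without
// blocking, then wait once before the host buffer may be modified or freed.
static cl_int upload_async(cl_command_queue q, cl_mem dst,
                           size_t offset_bytes, size_t size_bytes, const void* src)
{
    cl_int err = clEnqueueWriteBuffer(q, dst, CL_FALSE, offset_bytes,
                                      size_bytes, src, 0, NULL, NULL);
    if (err != CL_SUCCESS)
        return err;
    return clFinish(q);   // ensure the transfer is complete
}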
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XSYRK_HXX__ #define CLBLAS_BENCHMARK_XSYRK_HXX__ #include "clfunc_common.hpp" template struct xSyrkBuffer { clblasOrder order_; size_t n_; size_t k_; size_t lda_; size_t ldc_; size_t offA_; size_t offC_; size_t a_num_vectors_; size_t c_num_vectors_; clblasTranspose trans_a_; clblasUplo uplo_; T* a_; T* c_; cl_mem buf_a_; cl_mem buf_c_; T alpha_; T beta_; }; // struct buffer template class xSyrk : public clblasFunc { public: xSyrk(StatisticalTimer& _timer, cl_device_type devType) : clblasFunc(_timer, devType) { timer.getUniqueID("clSyrk", 0); } ~xSyrk() { } void call_func() { } double gflops() { return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } std::string gflops_formula() { return "(N*(N+1)*K)/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); DUMMY_ARGS_USAGE_2(ldb, offBX); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offC_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.trans_a_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = N; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.c_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldc_; ++j) { buffer_.c_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) / randomScale(); } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, 
buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T), buffer_.c_, 0, NULL, NULL); } void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T), buffer_.c_, 0, NULL, NULL); } void roundtrip_func() { } void zerocopy_roundtrip_func() { std::cout << "xSyrk::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); DUMMY_ARGS_USAGE_2(ldb, offBX); initialize_scalars(alpha, beta); buffer_.n_ = N; buffer_.k_ = K; buffer_.offA_ = offA; buffer_.offC_ = offCY; if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (ldc == 0) { buffer_.ldc_ = N; } else if (ldc < N) { std::cerr << "ldc:wrong size\n"; } else { buffer_.ldc_ = ldc; } buffer_.c_num_vectors_ = N; if (order_option == 0) { order_ = clblasRowMajor; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; buffer_.a_num_vectors_ = N; if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = K; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } else { order_ = clblasColumnMajor; if (transA_option == 0) { buffer_.a_num_vectors_ = K; buffer_.trans_a_ = clblasNoTrans; if (lda == 0) { buffer_.lda_ = N; } else if (lda < N) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } else { buffer_.a_num_vectors_ = N; if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (lda == 0) { buffer_.lda_ = K; } else if (lda < K) { std::cerr << "lda:wrong size\n"; exit(1); } else { buffer_.lda_ = lda; } } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.c_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_), "releasing buffer C"); } protected: void initialize_scalars(double alpha, double beta) { buffer_.alpha_ = makeScalar(alpha); buffer_.beta_ = makeScalar(beta); } private: xSyrkBuffer buffer_; }; // class xsyrk template<> void xSyrk:: call_func() { timer.Start(timer_id); clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, 4, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(float), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(float), NULL, &err); this->initialize_gpu_buffer(); clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_*sizeof(float), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(float), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk:: call_func() { timer.Start(timer_id); clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(double), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(double), NULL, &err); this->initialize_gpu_buffer(); clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_*sizeof(double), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(double), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk:: call_func() { timer.Start(timer_id); clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float2), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * 
sizeof(cl_float2), NULL, &err); this->initialize_gpu_buffer(); clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_*sizeof(cl_float2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_float2), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xSyrk::gflops() { return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } template<> std::string xSyrk::gflops_formula() { return "(4*N*(N+1)*K)/time"; } template<> void xSyrk:: call_func() { timer.Start(timer_id); clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xSyrk::roundtrip_func() { timer.Start(timer_id); cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double2), NULL, &err); buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldc_ * buffer_.c_num_vectors_ + buffer_.offC_) * sizeof(cl_double2), NULL, &err); this->initialize_gpu_buffer(); clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_, numQueues, queues_, 0, NULL, NULL); err = clEnqueueReadBuffer(queues_[0], buffer_.buf_c_, CL_TRUE, buffer_.offC_*sizeof(cl_double2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_double2), buffer_.c_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> double xSyrk::gflops() { return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } template<> std::string xSyrk::gflops_formula() { return "(4*N*(N+1)*K)/time"; } #endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__ clblas-2.10/src/client/clfunc_xtrmm.hpp000066400000000000000000000575261264277366700202160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
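call_func() brackets the clBLAS call and its clWaitForEvents with the shared StatisticalTimer, i.e. it measures host wall-clock time in nanoseconds. An equivalent stand-alone measurement with std::chrono looks roughly like this (WallClockNs is an illustrative class, not the client's StatisticalTimer):

#include <chrono>
#include <cstdint>

// Illustrative host-side timer: returns the elapsed interval in nanoseconds,
// the same unit the flops/ns GFLOPS formulas above expect.
class WallClockNs
{
public:
    void start() { begin_ = std::chrono::steady_clock::now(); }
    std::uint64_t stop() const
    {
        const auto end = std::chrono::steady_clock::now();
        return static_cast<std::uint64_t>(
            std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin_).count());
    }
private:
    std::chrono::steady_clock::time_point begin_;
};

// Usage: timer.start(); /* clblasXtrmm(...); clWaitForEvents(...); */
//        std::uint64_t ns = timer.stop();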
* ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XTRMM_HXX__ #define CLBLAS_BENCHMARK_XTRMM_HXX__ #include "clfunc_common.hpp" template struct xTrmmBuffer { clblasOrder order_; size_t m_; size_t n_; size_t lda_; size_t ldb_; size_t offA_; size_t offB_; size_t a_num_vectors_; size_t b_num_vectors_; clblasTranspose trans_a_; clblasSide side_; clblasUplo uplo_; clblasDiag diag_; T* a_; T* b_; cl_mem buf_a_; cl_mem buf_b_; T alpha_; }; // struct buffer template class xTrmm : public clblasFunc { public: xTrmm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clTrmm", 0); } ~xTrmm() { } void call_func() { std::cout << "xtrmm::call_func\n"; } double gflops() { if (buffer_.side_ == clblasLeft) { return buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); } else { return 20*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } std::string gflops_formula() { if (buffer_.side_ == clblasLeft) { return "M*(M+1)*N/time"; } else { return "M*(N+1)*N/time"; } } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_3(transB_option, K, beta); DUMMY_ARGS_USAGE_2(ldc, offCY); initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.offA_ = offA; buffer_.offB_ = offBX; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; } else if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (side_option == 0) { buffer_.side_ = clblasLeft; buffer_.a_num_vectors_ = M; } else { buffer_.side_ = clblasRight; buffer_.a_num_vectors_ = N; } if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (diag_option == 0) { buffer_.diag_ = clblasUnit; } else { buffer_.diag_ = clblasNonUnit; } if (order_option == 0) { order_ = clblasRowMajor; buffer_.b_num_vectors_ = M; if (ldb == 0) { buffer_.ldb_ = N; } else { if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { buffer_.order_ = clblasColumnMajor; buffer_.b_num_vectors_ = N; if (ldb == 0) { buffer_.ldb_ = M; } else { if (ldb < M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } if (lda == 0) { if (side_option == 0) { buffer_.lda_ = M; } else { buffer_.lda_ = N; } } else { if( side_option == 0 && lda < M ) { std::cerr << "ERROR: when side is 0, lda must be set to 0 " "or a value >= M" << std::endl; } else if(side_option == 0 && lda >= M ) { buffer_.lda_ = lda; } else if(side_option != 0 && lda < N) { std::cerr << "ERROR: when side is 1, lda must be set to 0 " "or a value >= N" << std::endl; } else if (side_option != 0 && lda >= N) { buffer_.lda_ = lda; } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.b_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldb_; ++j) { buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / 
randomScale(); } } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j && buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j] = ONE(); } else { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void roundtrip_func() { std::cout << "xTrmm::roundtrip_func\n"; } void zerocopy_roundtrip_func() { std::cout << "xTrmm::zerocopy_roundtrip_func\n"; } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_3(transB_option, K, beta); DUMMY_ARGS_USAGE_2(ldc, offCY); initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.offA_ = offA; buffer_.offB_ = offBX; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; } else if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (side_option == 0) { buffer_.side_ = clblasLeft; buffer_.a_num_vectors_ = M; } else { buffer_.side_ = clblasRight; buffer_.a_num_vectors_ = N; } if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (diag_option == 0) { buffer_.diag_ = clblasUnit; } else { buffer_.diag_ = clblasNonUnit; } if (order_option == 0) { order_ = clblasRowMajor; buffer_.b_num_vectors_ = M; if (ldb == 0) { buffer_.ldb_ = N; } else { if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { buffer_.order_ = clblasColumnMajor; buffer_.b_num_vectors_ = N; if (ldb == 0) { buffer_.ldb_ = M; } else { if (ldb < M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } if (lda == 0) { if (side_option == 0) { buffer_.lda_ = M; } else { buffer_.lda_ = N; } } else { if( side_option == 0 && lda < M ) { std::cerr << "ERROR: when side is 0, lda must be set to 0 " "or a value >= M" << std::endl; } else if(side_option == 0 && lda >= M ) { buffer_.lda_ = lda; } else if(side_option != 0 && lda < N) { std::cerr << "ERROR: when side is 1, lda must be set to 0 " "or a value >= N" << std::endl; } else if (side_option != 0 && lda >= N) { buffer_.lda_ = lda; } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.b_; OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_), "releasing buffer B"); } protected: void initialize_scalars(double alpha, double beta) { DUMMY_ARG_USAGE(beta); buffer_.alpha_ = makeScalar(alpha); } private: xTrmmBuffer buffer_; }; // class xTrmm template<> void xTrmm:: call_func() { timer.Start(timer_id); clblasStrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: roundtrip_func() { timer.Start(timer_id); cl_int err; //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float), buffer_.b_, 0, NULL, NULL); //call_func clblasStrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: call_func() { timer.Start(timer_id); clblasDtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offB_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: roundtrip_func() { timer.Start(timer_id); cl_int err; //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double), buffer_.b_, 0, NULL, NULL); //call_func clblasDtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); //read gpu buffer err = 
clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: call_func() { timer.Start(timer_id); clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: roundtrip_func() { timer.Start(timer_id); cl_int err; //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_float2), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_float2), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_float2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_float2), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_float2), buffer_.b_, 0, NULL, NULL); //call_func clblasCtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_float2), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: call_func() { timer.Start(timer_id); clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmm:: roundtrip_func() { timer.Start(timer_id); cl_int err; //set up buffer buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(cl_double2), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(cl_double2), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(cl_double2), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(cl_double2), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ *buffer_.b_num_vectors_ * sizeof(cl_double2), buffer_.b_, 0, NULL, NULL); //call_func clblasZtrmm(order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, NULL); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(cl_double2), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); 
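// Note on timing: in these roundtrip_func() specializations the Start()/Stop()
// pair brackets the entire host<->device path -- clCreateBuffer for A and B,
// blocking clEnqueueWriteBuffer of both inputs, the clblas*trmm enqueue, and a
// final clEnqueueReadBuffer of B. The event returned by that read-back is the
// only one waited on before the timer stops, so it acts as the completion fence
// for the whole sequence.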
timer.Stop(timer_id); } template<> double xTrmm:: gflops() { if (buffer_.side_ == clblasLeft) { return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); // NOTE i already had my version of clfunc_common integrated, so i went // ahead with that. i had a time_in_ns(), not a time_in_sec(), // so i adjusted the formula accordingly } else { return 4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } template<> double xTrmm:: gflops() { if (buffer_.side_ == clblasLeft) { return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); } else { return 4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } template<> std::string xTrmm:: gflops_formula() { if (buffer_.side_ == clblasLeft) { return "4.0*M*(M+1)*N/time"; } else { return "4.0*M*(N+1)*N/time"; } } template<> std::string xTrmm:: gflops_formula() { if (buffer_.side_ == clblasLeft) { return "4.0*M*(M+1)*N/time"; } else { return "4.0*M*(N+1)*N/time"; } } #endif // ifndef CLBLAS_BENCHMARK_XTRMM_HXX__ clblas-2.10/src/client/clfunc_xtrmv.hpp000066400000000000000000000240761264277366700202210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XTRMV_HXX__ #define CLBLAS_BENCHMARK_XTRMV_HXX__ #include "clfunc_common.hpp" template struct xTrmvBuffer { size_t m_; size_t lda_; size_t a_num_vectors_; clblasTranspose trans_a_; clblasUplo uplo_; clblasDiag diag_; T* a_; T* x_; cl_mem buf_a_; cl_mem buf_x_; cl_mem scratch_; }; // struct buffer template class xTrmv : public clblasFunc { public: xTrmv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clTrmv", 0); } ~xTrmv() { delete buffer_.a_; delete buffer_.x_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); OPENCL_V_THROW( clReleaseMemObject(buffer_.scratch_), "releasing buffer X"); } void call_func() {} double gflops() { return static_cast(buffer_.m_ * buffer_.m_ )/time_in_ns(); } std::string gflops_formula() { return "M*M/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer_.m_ = M; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; } else if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (diag_option == 0) { buffer_.diag_ = clblasUnit; } else { buffer_.diag_ = clblasNonUnit; } if (order_option == 0) { order_ = clblasRowMajor; } else { order_ = clblasColumnMajor; } if (lda == 
0) { buffer_.lda_ = M; } else { if( lda < M ) { std::cerr << "ERROR: lda must be set to 0 or a value >= M" << std::endl; } else if (lda >= M) { buffer_.lda_ = lda; } } buffer_.a_num_vectors_ = buffer_.m_; buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.x_ = new T[buffer_.m_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer_.lda_*buffer_.a_num_vectors_*sizeof(T), NULL, &err); buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.m_*sizeof(T), NULL, &err); buffer_.scratch_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.m_*sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j] = static_cast(1.0); } else { buffer_.a_[i*buffer_.lda_+j] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j] = static_cast(0.0); } } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, 0, buffer_.lda_*buffer_.a_num_vectors_*sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.x_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.m_, buffer_.x_, 0, NULL, NULL); } void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. 
(in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: void initialize_scalars(double alpha, double beta) { } private: xTrmvBuffer buffer_; }; // class xtrmv template<> void xTrmv:: initialize_scalars(double alpha, double beta) { } template<> void xTrmv:: initialize_scalars(double alpha, double beta) { } template<> void xTrmv:: call_func() { timer.Start(timer_id); clblasStrmv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.scratch_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmv:: call_func() { timer.Start(timer_id); clblasDtrmv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.scratch_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmv:: call_func() { timer.Start(timer_id); clblasCtrmv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.scratch_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmv:: call_func() { timer.Start(timer_id); clblasZtrmv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, buffer_.scratch_, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrmv:: initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.x_[i].s[1] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0f; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; } else { buffer_.a_[i*buffer_.lda_+j].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.a_[i*buffer_.lda_+j].s[1] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0f; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; } } } } template<> void xTrmv:: initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.x_[i].s[1] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; } else { buffer_.a_[i*buffer_.lda_+j].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.a_[i*buffer_.lda_+j].s[1] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; } } } } template<> double xTrmv:: gflops() { return static_cast(4 * buffer_.m_ * buffer_.m_ )/time_in_ns(); } template<> double xTrmv:: gflops() { return static_cast(4 * buffer_.m_ * buffer_.m_ )/time_in_ns(); } template<> std::string xTrmv:: gflops_formula() { return "4*M*M/time"; } template<> std::string xTrmv:: gflops_formula() { return "4*M*M/time"; } #endif // ifndef CLBLAS_BENCHMARK_XTRMV_HXX__ clblas-2.10/src/client/clfunc_xtrsm.hpp000066400000000000000000000600221264277366700202050ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XTRSM_HXX__ #define CLBLAS_BENCHMARK_XTRSM_HXX__ #include "clfunc_common.hpp" template struct xTrsmBuffer { clblasOrder order_; size_t m_; size_t n_; size_t lda_; size_t ldb_; size_t offA_; size_t offB_; size_t a_num_vectors_; size_t b_num_vectors_; clblasTranspose trans_a_; clblasSide side_; clblasUplo uplo_; clblasDiag diag_; T* a_; T* b_; cl_mem buf_a_; cl_mem buf_b_; T alpha_; }; // struct buffer template class xTrsm : public clblasFunc { public: xTrsm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clTrsm", 0); } ~xTrsm() { } void call_func() { timer.Start(timer_id); xTrsm_Function(true); timer.Stop(timer_id); } double gflops() { if (buffer_.side_ == clblasLeft) { return buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); } else { return buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } std::string gflops_formula() { if (buffer_.side_ == clblasLeft) { return "M*(M+1)*N/time"; } else { return "M*(N+1)*N/time"; } } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) { DUMMY_ARGS_USAGE_3(transB_option, K, beta); DUMMY_ARGS_USAGE_2(ldc, offCY); initialize_scalars(alpha, beta); buffer_.m_ = M; buffer_.n_ = N; buffer_.offA_ = offA; buffer_.offB_ = offBX; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; } else if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (side_option == 0) { buffer_.side_ = clblasLeft; buffer_.a_num_vectors_ = M; } else { buffer_.side_ = clblasRight; buffer_.a_num_vectors_ = N; } if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (diag_option == 0) { buffer_.diag_ = clblasUnit; } else { buffer_.diag_ = clblasNonUnit; } if (order_option == 0) { buffer_.order_ = clblasRowMajor; buffer_.b_num_vectors_ = M; if (ldb == 0) { buffer_.ldb_ = N; } else { if (ldb < N) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } else { buffer_.order_ = clblasColumnMajor; buffer_.b_num_vectors_ = N; if (ldb == 0) { buffer_.ldb_ = M; } else { if (ldb < M) { std::cerr << "ldb:wrong size\n"; exit(1); } else { buffer_.ldb_ = ldb; } } } if (lda == 0) { if (side_option == 0) { buffer_.lda_ = M; } else { buffer_.lda_ = N; } } else { if( side_option == 0 && lda < M ) { std::cerr << "ERROR: when side is 0, lda must be set to 0 " "or a value >= M" << std::endl; } else if(side_option == 0 && lda >= M ) { buffer_.lda_ = lda; } else if(side_option != 0 && lda < N) { std::cerr << "ERROR: when side is 1, lda must be set to 0 " "or a 
value >= N" << std::endl; } else if (side_option != 0 && lda >= N) { buffer_.lda_ = lda; } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.b_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.ldb_; ++j) { buffer_.b_[i*buffer_.ldb_+j] = random(UPPER_BOUND()) / randomScale(); } } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j] = ONE(); } else { buffer_.a_[i*buffer_.lda_+j] = random(UPPER_BOUND()) / randomScale(); } } else { buffer_.a_[i*buffer_.lda_+j] = ZERO(); } } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void read_gpu_buffer() { cl_int err; err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); } void roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); //initialize gpu buffer err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, buffer_.offA_ * sizeof(T), buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, NULL); //call func xTrsm_Function(false); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void allochostptr_roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); // Map the buffers to pointers at host device T *map_a,*map_b; map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, 
CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); // memcpy the input A, B to the mapped regions memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); // unmap the buffers clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); //call func xTrsm_Function(false); // map the B buffer again to read the output map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void usehostptr_roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), buffer_.a_, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), buffer_.b_, &err); //call func xTrsm_Function(false); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void copyhostptr_roundtrip_func() { timer.Start(timer_id); //set up buffer cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), buffer_.a_, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), buffer_.b_, &err); //call func xTrsm_Function(false); //read gpu buffer err = clEnqueueReadBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T), buffer_.b_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } void usepersismem_roundtrip_func() { #if defined(CL_MEM_USE_PERSISTENT_MEM_AMD) timer.Start(timer_id); //set up buffer cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.lda_ * buffer_.a_num_vectors_ + buffer_.offA_) * sizeof(T), NULL, &err); buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), NULL, &err); // Map the buffers to pointers at host device T *map_a,*map_b; map_a = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); // memcpy the input A, B to the mapped regions memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) ); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); // unmap 
the buffers clEnqueueUnmapMemObject(queues_[0], buffer_.buf_a_, map_a, 0, NULL, NULL); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); //call func xTrsm_Function(false); // map the B buffer again to read the output map_b = (T*)clEnqueueMapBuffer(queues_[0], buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0, (buffer_.ldb_ * buffer_.b_num_vectors_ + buffer_.offB_) * sizeof(T), 0, NULL, NULL, &err); memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) ); clEnqueueUnmapMemObject(queues_[0], buffer_.buf_b_, map_b, 0, NULL, NULL); clWaitForEvents(1, &event_); timer.Stop(timer_id); #else std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<= M" << std::endl; } else if(side_option == 0 && lda >= M ) { buffer_.lda_ = lda; } else if(side_option != 0 && lda < N) { std::cerr << "ERROR: when side is 1, lda must be set to 0 " "or a value >= N" << std::endl; } else if (side_option != 0 && lda >= N) { buffer_.lda_ = lda; } } buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_]; } void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor delete buffer_.a_; delete buffer_.b_; OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_), "releasing buffer B"); } protected: void initialize_scalars(double alpha, double beta) { DUMMY_ARG_USAGE(beta); buffer_.alpha_ = makeScalar(alpha); } private: xTrsmBuffer buffer_; void xTrsm_Function(bool flush); }; // class xtrsm template<> void xTrsm:: xTrsm_Function(bool flush) { clblasStrsm(buffer_.order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); if(flush==true) { clWaitForEvents(1, &event_); } } template<> void xTrsm:: xTrsm_Function(bool flush) { clblasDtrsm(buffer_.order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); if(flush==true) { clWaitForEvents(1, &event_); } } template<> void xTrsm:: xTrsm_Function(bool flush) { clblasCtrsm(buffer_.order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); if(flush==true) { clWaitForEvents(1, &event_); } } template<> void xTrsm:: xTrsm_Function(bool flush) { clblasZtrsm(buffer_.order_, buffer_.side_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.n_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_, buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_, numQueues, queues_, 0, NULL, &event_); if(flush==true) { clWaitForEvents(1, &event_); } } template<> double xTrsm:: gflops() { if (buffer_.side_ == clblasLeft) { return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); } else { return 4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } template<> double xTrsm:: gflops() { if (buffer_.side_ == clblasLeft) { return 4.0*buffer_.m_*(buffer_.m_+1)*buffer_.n_/time_in_ns(); } else { return 
4.0*buffer_.m_*(buffer_.n_+1)*buffer_.n_/time_in_ns(); } } template<> std::string xTrsm:: gflops_formula() { if (buffer_.side_ == clblasLeft) { return "4.0*M*(M+1)*N/time"; } else { return "4.0*M*(N+1)*N/time"; } } template<> std::string xTrsm:: gflops_formula() { if (buffer_.side_ == clblasLeft) { return "4.0*M*(M+1)*N/time"; } else { return "4.0*M*(N+1)*N/time"; } } #endif // ifndef CLBLAS_BENCHMARK_XTRSM_HXX__ clblas-2.10/src/client/clfunc_xtrsv.hpp000066400000000000000000000233431264277366700202230ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // $Id #ifndef CLBLAS_BENCHMARK_XTRSV_HXX__ #define CLBLAS_BENCHMARK_XTRSV_HXX__ #include "clfunc_common.hpp" template struct xTrsvBuffer { size_t m_; size_t lda_; size_t a_num_vectors_; clblasTranspose trans_a_; clblasUplo uplo_; clblasDiag diag_; T* a_; T* x_; cl_mem buf_a_; cl_mem buf_x_; }; // struct buffer template class xTrsv : public clblasFunc { public: xTrsv(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType) { timer.getUniqueID("clTrsv", 0); } ~xTrsv() { delete buffer_.a_; delete buffer_.x_; OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_), "releasing buffer A"); OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_x_), "releasing buffer X"); } void call_func() {} double gflops() { return static_cast(buffer_.m_ * buffer_.m_ )/time_in_ns(); } std::string gflops_formula() { return "M*M/time"; } void setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc,size_t offA, size_t offB, size_t offC, double alpha, double beta) { initialize_scalars(alpha, beta); buffer_.m_ = M; if (transA_option == 0) { buffer_.trans_a_ = clblasNoTrans; } else if (transA_option == 1) { buffer_.trans_a_ = clblasTrans; } else if (transA_option == 2) { buffer_.trans_a_ = clblasConjTrans; } if (uplo_option == 0) { buffer_.uplo_ = clblasUpper; } else { buffer_.uplo_ = clblasLower; } if (diag_option == 0) { buffer_.diag_ = clblasUnit; } else { buffer_.diag_ = clblasNonUnit; } if (order_option == 0) { order_ = clblasRowMajor; } else { order_ = clblasColumnMajor; } if (lda == 0) { buffer_.lda_ = M; } else { if( lda < M ) { std::cerr << "ERROR: lda must be set to 0 or a value >= M" << std::endl; } else if (lda >= M) { buffer_.lda_ = lda; } } buffer_.a_num_vectors_ = buffer_.m_; buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; buffer_.x_ = new T[buffer_.m_]; cl_int err; buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY, buffer_.lda_*buffer_.a_num_vectors_*sizeof(T), NULL, &err); buffer_.buf_x_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, buffer_.m_*sizeof(T), NULL, &err); } void initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i] = 
static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j] = static_cast(1.0); } else { buffer_.a_[i*buffer_.lda_+j] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j] = static_cast(0.0); } } } } void initialize_gpu_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_a_, CL_TRUE, 0, buffer_.lda_*buffer_.a_num_vectors_*sizeof(T), buffer_.a_, 0, NULL, NULL); err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.m_*sizeof(T), buffer_.x_, 0, NULL, NULL); } void reset_gpu_write_buffer() { cl_int err; err = clEnqueueWriteBuffer(queues_[0], buffer_.buf_x_, CL_TRUE, 0, buffer_.m_, buffer_.x_, 0, NULL, NULL); } void read_gpu_buffer() { //cl_int err; //to-do need to fill up } void roundtrip_func() {//to-do need to fill up } void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option, int diag_option, int transA_option, int transB_option, size_t M, size_t N, size_t K, size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY, double alpha, double beta) {} void releaseGPUBuffer_deleteCPUBuffer() { //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp) //need to do this before we eventually hit the destructor //to-do } protected: void initialize_scalars(double alpha, double beta) { } private: xTrsvBuffer buffer_; }; // class xtrsv template<> void xTrsv:: initialize_scalars(double alpha, double beta) { } template<> void xTrsv:: initialize_scalars(double alpha, double beta) { } template<> void xTrsv:: call_func() { timer.Start(timer_id); clblasStrsv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrsv:: call_func() { timer.Start(timer_id); clblasDtrsv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrsv:: call_func() { timer.Start(timer_id); clblasCtrsv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrsv:: call_func() { timer.Start(timer_id); clblasZtrsv(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.diag_, buffer_.m_, buffer_.buf_a_, 0, buffer_.lda_, buffer_.buf_x_, 0, 1, numQueues, queues_, 0, NULL, &event_); clWaitForEvents(1, &event_); timer.Stop(timer_id); } template<> void xTrsv:: initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.x_[i].s[1] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0f; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; } else { buffer_.a_[i*buffer_.lda_+j].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.a_[i*buffer_.lda_+j].s[1] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0f; 
buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0f; } } } } template<> void xTrsv:: initialize_cpu_buffer() { srand(10); for (size_t i = 0; i < buffer_.m_; ++i) { buffer_.x_[i].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.x_[i].s[1] = static_cast(rand())/static_cast(RAND_MAX); } for (size_t i = 0; i < buffer_.a_num_vectors_; ++i) { for (size_t j = 0; j < buffer_.lda_; ++j) { if (i == j) { if (buffer_.diag_ == clblasUnit) { buffer_.a_[i*buffer_.lda_+j].s[0] = 1.0; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; } else { buffer_.a_[i*buffer_.lda_+j].s[0] = static_cast(rand())/static_cast(RAND_MAX); buffer_.a_[i*buffer_.lda_+j].s[1] = static_cast(rand())/static_cast(RAND_MAX); } } else { buffer_.a_[i*buffer_.lda_+j].s[0] = 0.0; buffer_.a_[i*buffer_.lda_+j].s[1] = 0.0; } } } } template<> double xTrsv:: gflops() { return static_cast(4 * buffer_.m_ * buffer_.m_ )/time_in_ns(); } template<> double xTrsv:: gflops() { return static_cast(4 * buffer_.m_ * buffer_.m_ )/time_in_ns(); } template<> std::string xTrsv:: gflops_formula() { return "4*M*M/time"; } template<> std::string xTrsv:: gflops_formula() { return "4*M*M/time"; } #endif // ifndef CLBLAS_BENCHMARK_XTRSV_HXX__ clblas-2.10/src/client/client.cpp000066400000000000000000000465701264277366700167630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "statisticalTimer.h" #include "clfunc_xgemm.hpp" #include "clfunc_xtrmm.hpp" #include "clfunc_xtrsm.hpp" #include "clfunc_xgemv.hpp" #include "clfunc_xsymv.hpp" #include "clfunc_xsyrk.hpp" #include "clfunc_xsyr2k.hpp" #include "clfunc_xtrsv.hpp" #include "clfunc_xtrmv.hpp" #include "clfunc_xtrsv.hpp" #include "clfunc_xger.hpp" #include "clfunc_xsyr.hpp" #include "clfunc_xsyr2.hpp" #include "clfunc_xgeru.hpp" #include "clfunc_xgerc.hpp" #include "clfunc_xher.hpp" #include "clfunc_xher2.hpp" #include "clfunc_xhemv.hpp" #include "clfunc_xhemm.hpp" #include "clfunc_xsymm.hpp" #include "clfunc_xherk.hpp" #include "clfunc_xher2k.hpp" namespace po = boost::program_options; int main(int argc, char *argv[]) { size_t M; size_t N; size_t K; cl_double alpha; cl_double beta; cl_uint profileCount; cl_uint apiCallCount; cl_uint commandQueueFlags = 0; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; int order_option; //clblasOrder order; //clblasTranspose transA; //clblasTranspose transB; int transA_option; int transB_option; size_t lda; size_t ldb; size_t ldc; size_t offA; size_t offBX; size_t offCY; std::string function; std::string precision; std::string roundtrip; std::string memalloc; int side_option; int uplo_option; int diag_option; unsigned int numQueuesToUse; po::options_description desc( "clBLAS client command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) ( "all,a", "Force instantiation of all OpenCL devices" ) ( "useimages", "Use an image-based kernel" ) ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. if set to 0, ldc will default to M" ) ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object" ) ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object" ) ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object" ) ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) ( "order,o", po::value( &order_option )->default_value(0), "0 = row major, 1 = column major" ) ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. 
Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with [list of function families]" ) // xtrsm xtrmm ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm ( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: 20)" ) ( "apiCallCount", po::value(&apiCallCount)->default_value(10), "Time and report the kernel speed on counds of API calls (default: 10)") ( "numQueues", po::value(&numQueuesToUse)->default_value(1), "Number of cl_command_queues to use( default: 1)") ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)") ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem") ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "help" ) ) { std::cout << desc << std::endl; return 0; } if( function != "gemm" && function != "trsm" && function != "trmm" && function != "gemv" && function != "symv" && function != "syrk" && function != "syr2k" && function != "trsv" && function != "trmv" && function != "ger" && function != "syr" && function != "syr2" && function != "geru" && function != "gerc" && function != "her" && function != "her2" && function != "hemv" && function != "hemm" && function != "symm" && function != "herk" && function != "her2k" ) { std::cerr << "Invalid value for --function" << std::endl; return -1; } if( precision != "s" && precision != "d" && precision != "c" && precision != "z" ) { std::cerr << "Invalid value for --precision" << std::endl; return -1; } size_t mutex = ((vm.count( "gpu" ) > 0) ? 1 : 0) | ((vm.count( "cpu" ) > 0) ? 2 : 0) | ((vm.count( "all" ) > 0) ? 
4 : 0); if((mutex & (mutex-1)) != 0) { std::cerr << "You have selected mutually-exclusive OpenCL device options:" << std::endl; if (vm.count ( "gpu" ) > 0) std::cerr << " gpu,g Force instantiation of an OpenCL GPU device" << std::endl; if (vm.count ( "cpu" ) > 0) std::cerr << " cpu,c Force instantiation of an OpenCL CPU device" << std::endl; if (vm.count ( "all" ) > 0) std::cerr << " all,a Force instantiation of all OpenCL devices" << std::endl; return 1; } if( vm.count( "gpu" ) ) { deviceType = CL_DEVICE_TYPE_GPU; } if( vm.count( "cpu" ) ) { deviceType = CL_DEVICE_TYPE_CPU; } if( vm.count( "all" ) ) { deviceType = CL_DEVICE_TYPE_ALL; } if( profileCount >= 1 ) { commandQueueFlags |= CL_QUEUE_PROFILING_ENABLE; } bool useimages; if( vm.count("useimages") ) useimages = true; else useimages = false; StatisticalTimer& timer = StatisticalTimer::getInstance( ); timer.Reserve( 3, profileCount ); timer.setNormalize( true ); clblasFunc *my_function = NULL; if (function == "gemm") { if (precision == "s") my_function = new xGemm(timer, deviceType, numQueuesToUse); else if (precision == "d") my_function = new xGemm(timer, deviceType, numQueuesToUse); else if (precision == "c") my_function = new xGemm(timer, deviceType, numQueuesToUse); else if (precision == "z") my_function = new xGemm(timer, deviceType, numQueuesToUse); else { std::cerr << "Unknown gemm function" << std::endl; return -1; } } else if (function == "trsm") { if (precision == "s") my_function = new xTrsm(timer, deviceType); else if (precision == "d") my_function = new xTrsm(timer, deviceType); else if (precision == "c") my_function = new xTrsm(timer, deviceType); else if (precision == "z") my_function = new xTrsm(timer, deviceType); else { std::cerr << "Unknown trsm function" << std::endl; return -1; } } else if (function == "trmm") { if (precision == "s") my_function = new xTrmm(timer, deviceType); else if (precision == "d") my_function = new xTrmm(timer, deviceType); else if (precision == "c") my_function = new xTrmm(timer, deviceType); else if (precision == "z") my_function = new xTrmm(timer, deviceType); else { std::cerr << "Unknown trmm function" << std::endl; return -1; } } else if (function == "gemv") { if (precision == "s") my_function = new xGemv(timer, deviceType); else if (precision == "d") my_function = new xGemv(timer, deviceType); else if (precision == "c") my_function = new xGemv(timer, deviceType); else if (precision == "z") my_function = new xGemv(timer, deviceType); else { std::cerr << "Unknown gemv function" << std::endl; return -1; } } else if (function == "symv") { if (precision == "s") my_function = new xSymv(timer, deviceType); else if (precision == "d") my_function = new xSymv(timer, deviceType); else { std::cerr << "Unknown symv function" << std::endl; return -1; } } else if (function == "syrk") { if (precision == "s") my_function = new xSyrk(timer, deviceType); else if (precision == "d") my_function = new xSyrk(timer, deviceType); else if (precision == "c") my_function = new xSyrk(timer, deviceType); else if (precision == "z") my_function = new xSyrk(timer, deviceType); else { std::cerr << "Unknown syrk function" << std::endl; return -1; } } else if (function == "syr2k") { if (precision == "s") my_function = new xSyr2k(timer, deviceType); else if (precision == "d") my_function = new xSyr2k(timer, deviceType); else if (precision == "c") my_function = new xSyr2k(timer, deviceType); else if (precision == "z") my_function = new xSyr2k(timer, deviceType); else { std::cerr << "Unknown syr2k function" << std::endl; 
return -1; } } else if (function == "trsv") { if (precision == "s") my_function = new xTrsv(timer, deviceType); else if (precision == "d") my_function = new xTrsv(timer, deviceType); else if (precision == "c") my_function = new xTrsv(timer, deviceType); else if (precision == "z") my_function = new xTrsv(timer, deviceType); else { std::cerr << "Unknown trsv function" << std::endl; return -1; } } else if (function == "trmv") { if (precision == "s") my_function = new xTrmv(timer, deviceType); else if (precision == "d") my_function = new xTrmv(timer, deviceType); else if (precision == "c") my_function = new xTrmv(timer, deviceType); else if (precision == "z") my_function = new xTrmv(timer, deviceType); else { std::cerr << "Unknown trmv function" << std::endl; return -1; } } else if (function == "ger") { if (precision == "s") my_function = new xGer(timer, deviceType); else if (precision == "d") my_function = new xGer(timer, deviceType); else { std::cerr << "Unknown ger function" << std::endl; return -1; } } else if (function == "syr") { if (precision == "s") my_function = new xSyr(timer, deviceType); else if (precision == "d") my_function = new xSyr(timer, deviceType); else { std::cerr << "Unknown syr function" << std::endl; return -1; } } else if (function == "syr2") { if (precision == "s") my_function = new xSyr2(timer, deviceType); else if (precision == "d") my_function = new xSyr2(timer, deviceType); else { std::cerr << "Unknown syr2 function" << std::endl; return -1; } } else if (function == "geru") { if (precision == "c") my_function = new xGeru(timer, deviceType); else if (precision == "z") my_function = new xGeru(timer, deviceType); else { std::cerr << "Unknown geru function" << std::endl; return -1; } } else if (function == "gerc") { if (precision == "c") my_function = new xGerc(timer, deviceType); else if (precision == "z") my_function = new xGerc(timer, deviceType); else { std::cerr << "Unknown gerc function" << std::endl; return -1; } } else if (function == "her") { if (precision == "c") my_function = new xHer(timer, deviceType); else if (precision == "z") my_function = new xHer(timer, deviceType); else { std::cerr << "Unknown her function" << std::endl; return -1; } } else if (function == "her2") { if (precision == "c") my_function = new xHer2(timer, deviceType); else if (precision == "z") my_function = new xHer2(timer, deviceType); else { std::cerr << "Unknown her2 function" << std::endl; return -1; } } else if (function == "hemv") { if (precision == "c") my_function = new xHemv(timer, deviceType); else if (precision == "z") my_function = new xHemv(timer, deviceType); else { std::cerr << "Unknown hemv function" << std::endl; return -1; } } else if (function == "hemm") { if (precision == "c") my_function = new xHemm(timer, deviceType); else if (precision == "z") my_function = new xHemm(timer, deviceType); else { std::cerr << "Unknown hemm function" << std::endl; return -1; } } else if (function == "herk") { if (precision == "c") my_function = new xHerk(timer, deviceType); else if (precision == "z") my_function = new xHerk(timer, deviceType); else { std::cerr << "Unknown her function" << std::endl; return -1; } } else if (function == "her2k") { if (precision == "c") my_function = new xHer2k(timer, deviceType); else if (precision == "z") my_function = new xHer2k(timer, deviceType); else { std::cerr << "Unknown her2 function" << std::endl; return -1; } } else if (function == "symm") { if (precision == "s") my_function = new xSymm(timer, deviceType); else if (precision == "d") 
my_function = new xSymm(timer, deviceType); else if (precision == "c") my_function = new xSymm(timer, deviceType); else if (precision == "z") my_function = new xSymm(timer, deviceType); else { std::cerr << "Unknown symm function" << std::endl; return -1; } } try { my_function->setup_buffer( order_option, side_option, uplo_option, diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); my_function->initialize_cpu_buffer(); my_function->initialize_gpu_buffer(); my_function->setup_apiCallCount(apiCallCount); my_function->call_func(); // do a calculation first to get any compilation out of the way my_function->reset_gpu_write_buffer(); // reset GPU write buffer } catch( std::exception& exc ) { std::cerr << exc.what( ) << std::endl; return 1; } if(roundtrip=="roundtrip"||roundtrip=="both") { timer.Reset(); for( cl_uint i = 0; i < profileCount; ++i ) { my_function->roundtrip_setup_buffer( order_option, side_option, uplo_option, diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); my_function->initialize_cpu_buffer(); /*my_function->initialize_gpu_buffer(); my_function->call_func(); my_function->read_gpu_buffer(); my_function->reset_gpu_write_buffer();*/ if(memalloc=="default") { my_function->roundtrip_func(); } else if (memalloc=="alloc_host_ptr") { my_function->allochostptr_roundtrip_func(); } else if (memalloc=="use_host_ptr") { my_function->usehostptr_roundtrip_func(); } else if (memalloc=="copy_host_ptr") { my_function->copyhostptr_roundtrip_func(); } else if (memalloc=="use_persistent_mem_amd") { my_function->usepersismem_roundtrip_func(); } else if (memalloc=="rect_mem") { my_function->roundtrip_func_rect(); } //my_function->reset_gpu_write_buffer(); my_function->releaseGPUBuffer_deleteCPUBuffer(); } if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) { //std::cout << timer << std::endl; timer.pruneOutliers( 3.0 ); std::cout << "BLAS (round trip) execution time < ns >: " << my_function->time_in_ns() << std::endl; std::cout << "BLAS (round trip) execution Gflops < " << my_function->gflops_formula() << " >: " << my_function->gflops() << std::endl; } } if(roundtrip=="noroundtrip"||roundtrip=="both") { timer.Reset(); my_function->setup_buffer( order_option, side_option, uplo_option, diag_option, transA_option, transB_option, M, N, K, lda, ldb, ldc, offA, offBX, offCY, alpha, beta ); my_function->initialize_cpu_buffer(); my_function->initialize_gpu_buffer(); my_function->setup_apiCallCount( apiCallCount ); for (cl_uint i = 0; i < profileCount; ++i) { my_function->call_func(); } my_function->read_gpu_buffer(); //my_function->reset_gpu_write_buffer(); my_function->releaseGPUBuffer_deleteCPUBuffer(); if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE ) { //std::cout << timer << std::endl; timer.pruneOutliers( 3.0 ); std::cout << "BLAS kernel execution time < ns >: " << my_function->time_in_ns() / apiCallCount << std::endl; std::cout << "BLAS kernel execution Gflops < " << my_function->gflops_formula() << " >: " << my_function->gflops() << std::endl; } } delete my_function; return 0; } clblas-2.10/src/client/ctimer.h000066400000000000000000000024051264277366700164220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef C_TIMER_HXX__ #define C_TIMER_HXX__ #if defined(__cplusplus) typedef class timer *Timer; #else typedef struct timer *Timer; #endif #if defined(__cplusplus) extern "C" { #endif extern Timer CreateTimer(); extern void DeleteTimer(Timer timer); extern double GetTime(Timer timer); extern void PauseTimer(Timer timer); extern void RestartTimer(Timer timer); extern void ResetTimer(Timer timer); extern void ResetDelayTimer(Timer timer, double delay_time); #if defined(__cplusplus) } #endif #endif // ifndef C_TIMER_HXX__ clblas-2.10/src/client/makefile000066400000000000000000000005241264277366700164660ustar00rootroot00000000000000SHELL = /bin/bash CXX = g++ CXXFLAGS = -O3 -fomit-frame-pointer -finline-functions -I../include -I../tests/include LIBS = -lclblas -lOpenCL -lboost_program_options -lrt .PHONY: clean %.o:%.cpp ${CXX} ${CXXFLAGS} $< -c clblas_client: clblas_client.o statisticalTimer.o timer.o ${CXX} ${CXXFLAGS} $^ ${LIBS} -o $@ clean: rm -rf *.o clblas-2.10/src/client/statisticalTimer.cpp000066400000000000000000000203361264277366700210220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // StatTimer.cpp : Defines the exported functions for the DLL application. 
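//
// Illustrative usage (a sketch of how client.cpp drives this timer; profileCount
// and the BLAS call are placeholders here, and the time reported by the clfunc
// wrappers' time_in_ns() in client.cpp comes from these same samples):
//
//   StatisticalTimer& timer = StatisticalTimer::getInstance();
//   StatisticalTimer::sTimerID id = timer.getUniqueID("clTrmm", 0);
//   timer.Reserve(3, profileCount);   // pre-size per-event sample storage
//   timer.setNormalize(true);         // report seconds via clkFrequency
//   for (cl_uint i = 0; i < profileCount; ++i)
//   {
//       timer.Start(id);
//       // ... enqueue the BLAS call and wait for its event ...
//       timer.Stop(id);
//   }
//   timer.pruneOutliers(3.0);         // discard samples far from the mean
//   double seconds = timer.getAverageTime(id);
//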
// #include "stdafx.h" #include #include #include #include #include #include "statisticalTimer.h" #if defined( __GNUC__ ) #include #endif // Functor object to help with accumulating values in vectors template< typename T > struct Accumulator: public std::unary_function< T, void > { T acc; Accumulator( ): acc( 0 ) {} void operator( )(T x) { acc += x; } }; // Unary predicate used for remove_if() algorithm // Currently, RangeType is expected to be a floating point type, and ValType an integer type template< typename RangeType, typename ValType > struct PruneRange { RangeType lower, upper; PruneRange( RangeType mean, RangeType stdev ): lower( mean-stdev ), upper( mean+stdev ) {} bool operator( )( ValType val ) { // These comparisons can be susceptible to signed/unsigned casting problems // This is why we cast ValType to RangeType, because RangeType should always be floating and signed if( static_cast< RangeType >( val ) < lower ) return true; else if( static_cast< RangeType >( val ) > upper ) return true; return false; } }; StatisticalTimer& StatisticalTimer::getInstance( ) { static StatisticalTimer timer; return timer; } StatisticalTimer::StatisticalTimer( ): nEvents( 0 ), nSamples( 0 ), normalize( true ) { #if defined( _WIN32 ) // OS call to get ticks per second2 ::QueryPerformanceFrequency( reinterpret_cast( &clkFrequency ) ); #else clkFrequency = 1000000; #endif } StatisticalTimer::~StatisticalTimer( ) {} void StatisticalTimer::Clear( ) { labelID.clear( ); clkStart.clear( ); clkTicks.clear( ); } void StatisticalTimer::Reset( ) { if( nEvents == 0 || nSamples == 0 ) throw std::runtime_error( "StatisticalTimer::Reserve( ) was not called before Reset( )" ); clkStart.clear( ); clkTicks.clear( ); clkStart.resize( nEvents ); clkTicks.resize( nEvents ); for( unsigned int i = 0; i < nEvents; ++i ) { clkTicks.at( i ).reserve( nSamples ); } return; } // The caller can pre-allocate memory, to improve performance. // nEvents is an approximate value for how many seperate events the caller will think // they will need, and nSamples is a hint on how many samples we think we will take // per event void StatisticalTimer::Reserve( unsigned int nEvents, unsigned int nSamples ) { this->nEvents = std::max (1, nEvents); this->nSamples = std::max (1, nSamples); Clear( ); labelID.reserve( nEvents ); clkStart.resize( nEvents ); clkTicks.resize( nEvents ); for( unsigned int i = 0; i < nEvents; ++i ) { clkTicks.at( i ).reserve( nSamples ); } } void StatisticalTimer::setNormalize( bool norm ) { normalize = norm; } void StatisticalTimer::Start( sTimerID id ) { #if defined( _WIN32 ) ::QueryPerformanceCounter( reinterpret_cast( &clkStart.at( id ) ) ); #else struct timeval s; gettimeofday(&s, 0); clkStart.at( id ) = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec; #endif } void StatisticalTimer::Stop( sTimerID id ) { unsigned long long n; #if defined( _WIN32 ) ::QueryPerformanceCounter( reinterpret_cast( &n ) ); #else struct timeval s; gettimeofday(&s, 0); n = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec; #endif n -= clkStart.at( id ); clkStart.at( id ) = 0; AddSample( id, n ); } void StatisticalTimer::AddSample( const sTimerID id, const unsigned long long n ) { clkTicks.at( id ).push_back( n ); } // This function's purpose is to provide a mapping from a 'friendly' human readable text string // to an index into internal data structures. 
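//
// For example (illustrative only; the labels are arbitrary placeholders):
//
//     StatisticalTimer::sTimerID gemmId = timer.getUniqueID( "clGemm", 0 );
//     StatisticalTimer::sTimerID readId = timer.getUniqueID( "readBuffer", 0 );
//
// returns two distinct ids, while calling getUniqueID() again with a
// (label, groupID) pair that was already registered returns the id that was
// handed out the first time.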
StatisticalTimer::sTimerID StatisticalTimer::getUniqueID( const std::string& label, unsigned int groupID ) { // I expect labelID will hardly ever grow beyond 30, so it's not of any use // to keep this sorted and do a binary search labelPair sItem = std::make_pair( label, groupID ); stringVector::iterator iter; iter = std::find( labelID.begin(), labelID.end(), sItem ); if( iter != labelID.end( ) ) return std::distance( labelID.begin( ), iter ); labelID.push_back( sItem ); return labelID.size( ) - 1; } double StatisticalTimer::getMean( sTimerID id ) const { if( clkTicks.empty( ) ) return 0; size_t N = clkTicks.at( id ).size( ); Accumulator sum = std::for_each( clkTicks.at( id ).begin(), clkTicks.at( id ).end(), Accumulator() ); return static_cast( sum.acc ) / N; } double StatisticalTimer::getVariance( sTimerID id ) const { if( clkTicks.empty( ) ) return 0; double mean = getMean( id ); size_t N = clkTicks.at( id ).size( ); double sum = 0; for( unsigned int i = 0; i < N; ++i ) { double diff = clkTicks.at( id ).at( i ) - mean; diff *= diff; sum += diff; } return sum / N; } double StatisticalTimer::getStdDev( sTimerID id ) const { double variance = getVariance( id ); return sqrt( variance ); } double StatisticalTimer::getAverageTime( sTimerID id ) const { if( normalize ) return getMean( id ) / clkFrequency; else return getMean( id ); } double StatisticalTimer::getMinimumTime( sTimerID id ) const { clkVector::const_iterator iter = std::min_element( clkTicks.at( id ).begin( ), clkTicks.at( id ).end( ) ); if( iter != clkTicks.at( id ).end( ) ) { if( normalize ) return static_cast( *iter ) / clkFrequency; else return static_cast( *iter ); } else return 0; } unsigned int StatisticalTimer::pruneOutliers( sTimerID id , double multiple ) { if( clkTicks.empty( ) ) return 0; double mean = getMean( id ); double stdDev = getStdDev( id ); clkVector& clks = clkTicks.at( id ); // Look on p. 
379, "The C++ Standard Library" // std::remove_if does not actually erase, it only copies elements, it returns new 'logical' end clkVector::iterator newEnd = std::remove_if( clks.begin( ), clks.end( ), PruneRange< double,unsigned long long >( mean, multiple*stdDev ) ); clkVector::difference_type dist = std::distance( newEnd, clks.end( ) ); if( dist != 0 ) clks.erase( newEnd, clks.end( ) ); assert( dist < std::numeric_limits< unsigned int >::max( ) ); return static_cast< unsigned int >( dist ); } unsigned int StatisticalTimer::pruneOutliers( double multiple ) { unsigned int tCount = 0; for( unsigned int l = 0; l < labelID.size( ); ++l ) { unsigned int lCount = pruneOutliers( l , multiple ); std::clog << "\tStatisticalTimer:: Pruning " << lCount << " samples from " << labelID[l].first << std::endl; tCount += lCount; } return tCount; } // Defining an output print operator std::ostream& operator<<( std::ostream& os, const StatisticalTimer& st ) { if( st.clkTicks.empty( ) ) return os; std::ios::fmtflags bckup = os.flags( ); for( unsigned int l = 0; l < st.labelID.size( ); ++l ) { unsigned long long min = 0; StatisticalTimer::clkVector::const_iterator iter = std::min_element( st.clkTicks.at( l ).begin( ), st.clkTicks.at( l ).end( ) ); if( iter != st.clkTicks.at( l ).end( ) ) min = *iter; os << st.labelID[l].first << ", " << st.labelID[l].second << std::fixed << std::endl; os << "Min:," << min << std::endl; os << "Mean:," << st.getMean( l ) << std::endl; os << "StdDev:," << st.getStdDev( l ) << std::endl; os << "AvgTime:," << st.getAverageTime( l ) << std::endl; os << "MinTime:," << st.getMinimumTime( l ) << std::endl; for( unsigned int t = 0; t < st.clkTicks[l].size( ); ++t ) { os << st.clkTicks[l][t]<< ","; } os << "\n" << std::endl; } os.flags( bckup ); return os; } clblas-2.10/src/client/statisticalTimer.h000066400000000000000000000122441264277366700204660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #pragma once #ifndef _STATISTICALTIMER_H_ #define _STATISTICALTIMER_H_ #include #include #include /** * \file clAmdFft.StatisticalTimer.h * \brief A timer class that provides a cross platform timer for use * in timing code progress with a high degree of accuracy. * This class is implemented entirely in the header, to facilitate inclusion into multiple * projects without needing to compile an object file for each project. */ /** * \class StatisticalTimer * \brief Counter that provides a fairly accurate timing mechanism for both * windows and linux. This timer is used extensively in all the samples. 
*/ class StatisticalTimer { // Private typedefs typedef std::vector< unsigned long long > clkVector; typedef std::pair< std::string, unsigned int > labelPair; typedef std::vector< labelPair > stringVector; // In order to calculate statistics , we need to keep a history of our timings stringVector labelID; clkVector clkStart; std::vector< clkVector > clkTicks; // How many clockticks in a second unsigned long long clkFrequency; // Saved sizes for our vectors, used in Reset() to reallocate vectors clkVector::size_type nEvents, nSamples; // This setting controls whether the Timer should convert samples into time by dividing by the // clock frequency bool normalize; /** * \fn StatisticalTimer() * \brief Constructor for StatisticalTimer that initializes the class * This is private so that user code cannot create their own instantiation. Instead, you * must go through getInstance( ) to get a reference to the class. */ StatisticalTimer( ); /** * \fn ~StatisticalTimer() * \brief Destructor for StatisticalTimer that cleans up the class */ ~StatisticalTimer( ); /** * \fn StatisticalTimer(const StatisticalTimer& ) * \brief Copy constructors do not make sense for a singleton, disallow copies */ StatisticalTimer( const StatisticalTimer& ); /** * \fn operator=( const StatisticalTimer& ) * \brief Assignment operator does not make sense for a singleton, disallow assignments */ StatisticalTimer& operator=( const StatisticalTimer& ); friend std::ostream& operator<<( std::ostream& os, const StatisticalTimer& s ); public: // Public typedefs typedef stringVector::difference_type sTimerID; /** * \fn getInstance() * \brief This returns a reference to the singleton timer. Guarantees only 1 timer class is ever * instantiated within a compilable executable. */ static StatisticalTimer& getInstance( ); /** * \fn void Start( sTimerID id ) * \brief Start the timer * \sa Stop(), Reset() */ void Start( sTimerID id ); /** * \fn void Stop( sTimerID id ) * \brief Stop the timer * \sa Start(), Reset() */ void Stop( sTimerID id ); /** * \fn void AddSample( const sTimerID id, const unsigned long long n ) * \brief Explicitely add a timing sample into the class */ void AddSample( const sTimerID id, const unsigned long long n ); /** * \fn void Reset(void) * \brief Reset the timer to 0 * \sa Start(), Stop() */ void Clear( ); /** * \fn void Reset(void) * \brief Reset the timer to 0 * \sa Start(), Stop() */ void Reset( ); void Reserve( unsigned int nEvents, unsigned int nSamples ); sTimerID getUniqueID( const std::string& label, unsigned int groupID ); // Calculate the average/mean of data for a given event void setNormalize( bool norm ); // Calculate the average/mean of data for a given event double getMean( sTimerID id ) const; // Calculate the variance of data for a given event // Variance - average of the squared differences between data points and the mean double getVariance( sTimerID id ) const; // Sqrt of variance, also in units of the original data double getStdDev( sTimerID id ) const; /** * \fn double getAverageTime(sTimerID id) const * \return Return the arithmetic mean of all the samples that have been saved */ double getAverageTime( sTimerID id ) const; /** * \fn double getMinimumTime(sTimerID id) const * \return Return the arithmetic min of all the samples that have been saved */ double getMinimumTime( sTimerID id ) const; // Using the stdDev of the entire population (of an id), eliminate those samples that fall // outside some specified multiple of the stdDev. This assumes that the population // form a gaussian curve. 
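// For instance, pruneOutliers( 3.0 ) keeps only the samples that lie inside
// [mean - 3*stdDev, mean + 3*stdDev] for every registered event and returns
// the total number of samples that were discarded.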
unsigned int pruneOutliers( double multiple ); unsigned int pruneOutliers( sTimerID id , double multiple ); }; #endif // _STATISTICALTIMER_H_ clblas-2.10/src/client/stdafx.cpp000066400000000000000000000020231264277366700167570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // stdafx.cpp : source file that includes just the standard includes // clAmdFft.pch will be the pre-compiled header // stdafx.obj will contain the pre-compiled type information #include "stdafx.h" // TODO: reference any additional headers you need in STDAFX.H // and not in this file clblas-2.10/src/client/stdafx.h000066400000000000000000000023361264277366700164330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // stdafx.h : include file for standard system include files, // or project specific include files that are used frequently, but // are changed infrequently // #pragma once #include "targetver.h" #include #include #include #include #include #include #if defined( _WIN32 ) #define NOMINMAX #define WIN32_LEAN_AND_MEAN // Exclude rarely-used stuff from Windows headers #include #include #endif clblas-2.10/src/client/targetver.h000066400000000000000000000021041264277366700171360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #pragma once // Including SDKDDKVer.h defines the highest available Windows platform. 
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and // set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h. #if defined( _WIN32 ) #include #endif clblas-2.10/src/client/testPerfWrapper.cpp000066400000000000000000000200751264277366700206320ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #if defined( _WIN32 ) #define popen _popen #define pclose _pclose #pragma warning (disable:4996) #endif namespace po = boost::program_options; int main(int argc, char *argv[]) { size_t M; size_t N; size_t K; cl_double alpha; cl_double beta; cl_uint profileCount; int order_option; int transA_option; int transB_option; int uplo_option; int side_option; int diag_option; size_t lda; size_t ldb; size_t ldc; size_t offA; size_t offBX; size_t offCY; std::string function; std::string perf_options; std::string precision; std::string command_line; FILE *perf_pipe; float perfGFL; int test_case; perf_options = ""; po::options_description desc( "clBLAS client command line options" ); desc.add_options() ( "help,h", "produces this help message" ) ( "gpu,g", "Force instantiation of an OpenCL GPU device" ) ( "cpu,c", "Force instantiation of an OpenCL CPU device" ) ( "all,a", "Force instantiation of all OpenCL devices" ) ( "useimages", "Use an image-based kernel" ) ( "sizem,m", po::value( &M )->default_value(128), "number of rows in A and C" ) ( "sizen,n", po::value( &N )->default_value(128), "number of columns in B and C" ) ( "sizek,k", po::value( &K )->default_value(128), "number of columns in A and rows in B" ) ( "lda", po::value( &lda )->default_value(0), "first dimension of A in memory. if set to 0, lda will default to M (when transposeA is \"no transpose\") or K (otherwise)" ) ( "ldb", po::value( &ldb )->default_value(0), "first dimension of B in memory. if set to 0, ldb will default to K (when transposeB is \"no transpose\") or N (otherwise)" ) ( "ldc", po::value( &ldc )->default_value(0), "first dimension of C in memory. 
if set to 0, ldc will default to M" ) ( "offA", po::value( &offA )->default_value(0), "offset of the matrix A in memory object (ignored, just for compatibility with the python script)" ) ( "offBX", po::value( &offBX )->default_value(0), "offset of the matrix B or vector X in memory object (ignored, just for compatibility with the python script)" ) ( "offCY", po::value( &offCY )->default_value(0), "offset of the matrix C or vector Y in memory object (ignored, just for compatibility with the python script)" ) ( "alpha", po::value( &alpha )->default_value(1.0f), "specifies the scalar alpha" ) ( "beta", po::value( &beta )->default_value(1.0f), "specifies the scalar beta" ) ( "order,o", po::value( &order_option )->default_value(0), "0 = row major, 1 = column major" ) ( "transposeA", po::value( &transA_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) ( "transposeB", po::value( &transB_option )->default_value(0), "0 = no transpose, 1 = transpose, 2 = conjugate transpose" ) ( "function,f", po::value( &function )->default_value("gemm"), "BLAS function to test. Options: gemm, trsm, trmm, gemv, symv, syrk, syr2k" ) ( "precision,r", po::value( &precision )->default_value("s"), "Options: s,d,c,z" ) ( "side", po::value( &side_option )->default_value(0), "0 = left, 1 = right. only used with trmm, trsm" ) ( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with trmm, trs, syrk, syr2k, symv" ) ( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with trmm, trsm" ) ( "profile,p", po::value( &profileCount )->default_value(1), "Time and report the kernel speed (default: profiling off)" ) ; po::variables_map vm; po::store( po::parse_command_line( argc, argv, desc ), vm ); po::notify( vm ); if( vm.count( "help" ) ) { std::cout << desc << std::endl; return 0; } if( vm.count( "cpu" ) ) { perf_options += " --device cpu"; } else { perf_options += " --device gpu"; } perf_options = " --gtest_filter=Custom/"; test_case = 0; if( function == "gemm" ) { perf_options += "GEMM."; test_case += transB_option; test_case += 3 * transA_option; test_case += 9 * (1 - order_option); } else if( function == "trmm" ) { perf_options += "TRMM."; test_case += diag_option; test_case += 2 * transA_option; test_case += 6 * uplo_option; test_case += 12 * side_option; test_case += 24 * (1 - order_option); } else if( function == "trsm" ) { perf_options += "TRSM."; test_case += diag_option; test_case += 2 * transA_option; test_case += 6 * uplo_option; test_case += 12 * side_option; test_case += 24 * (1 - order_option); } else if( function == "syrk" ) { perf_options += "SYRK."; test_case += transA_option; test_case += 3 * uplo_option; test_case += 6 * (1 - order_option); } else if( function == "syr2k" ) { perf_options += "SYR2K."; test_case += transA_option; test_case += 3 * uplo_option; test_case += 6 * (1 - order_option); } else if( function == "gemv" ) { perf_options += "GEMV."; test_case += transA_option; test_case += 3 * (1 - order_option); } else if( function == "symv" ) { perf_options += "SYMV."; test_case += uplo_option; test_case += 2 * (1 - order_option); } else { std::cerr << "Invalid value for --function" << std::endl; return -1; } perf_options += precision + function; std::stringstream sizes_str; sizes_str << "/" << test_case << " " << M << " " << N << " " << K; perf_options += sizes_str.str(); command_line = "test-performance" + perf_options; std::cerr << "Command line: " << command_line << std::endl; 
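// Launch the assembled test-performance command through a pipe and scan its
// standard output for a line of the form "average performance = %f"; the
// captured value is reported below as the Gflops figure.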
perfGFL = 0; perf_pipe = popen( command_line.c_str(), "r" ); if (perf_pipe == NULL) { perror(command_line.c_str()); std::cerr << "Could not run " << command_line << std::endl; return -1; } else { char strbuf[512]; while(!feof(perf_pipe)) { strbuf[0] = '\0'; if (fgets(strbuf, sizeof(strbuf), perf_pipe) == NULL) { std::cout << "[ERROR]: Read from the pipe has failed!" << std::endl; pclose(perf_pipe); return 1; } if (sscanf(strbuf, "average performance = %f", &perfGFL) == 1) { break; } } } pclose(perf_pipe); std::cout << "BLAS kernel execution Gflops < >: " << perfGFL << std::endl; return 0; } clblas-2.10/src/client/timer.cpp000066400000000000000000000032351264277366700166140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "ctimer.h" #include "timer.hpp" timer:: timer() : elapsed_time_(0.0) { init_time_ = get_walltime(); } timer:: ~timer() { } double timer:: get() { return elapsed_time_ + get_walltime() - init_time_; } void timer:: pause() { elapsed_time_ = get(); } void timer:: restart() { init_time_ = get_walltime(); } void timer:: reset() { elapsed_time_ = 0.0; init_time_ = get_walltime(); } void timer:: reset_delay(double delay_time) { reset(); elapsed_time_ = delay_time; } Timer CreateTimer() { Timer local_timer = new timer(); return local_timer; } void DeleteTimer(Timer timer) { delete timer; } double GetTime(Timer timer) { return timer->get(); } void ResetTimer(Timer timer) { timer->reset(); } void RestartTimer(Timer timer) { timer->restart(); } void PauseTimer(Timer timer) { timer->pause(); } void ResetDelayTimer(Timer timer, double delay_time) { timer->reset_delay(delay_time); } clblas-2.10/src/client/timer.hpp000066400000000000000000000024051264277366700166170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef TIMER_HXX__ #define TIMER_HXX__ #include class timer { public: double get(); void pause(); void restart(); void reset(); void reset_delay(double delay_time); private: inline double get_walltime() { struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts); return static_cast(ts.tv_sec) + static_cast(ts.tv_nsec) * 1.0e-9; } private: double init_time_; double elapsed_time_; public: timer(); ~timer(); }; // class timer #endif // ifndef TIMER_HXX__ clblas-2.10/src/flags_public.txt000066400000000000000000000001331264277366700166770ustar00rootroot00000000000000TAHITI_OCL " "; HAWAII1_OCL " "; HAWAII2_OCL "-cl-std=CL2.0"; BONAIRE_OCL "-cl-std=CL2.0"; clblas-2.10/src/include/000077500000000000000000000000001264277366700151325ustar00rootroot00000000000000clblas-2.10/src/include/binary_lookup.h000066400000000000000000000227141264277366700201660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef __CLBLAS_BINARY_LOOKUP__ #define __CLBLAS_BINARY_LOOKUP__ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #include // // BinaryLookup defines an API to manage the kernel cache on the disk // // The BinaryLookup object provides methods to: // * check if a cache file exists on the disk or not // * fill-up the signature to characterize the program beeing built on the disk // * build a cl_program from a string kernel or from a binary // // A cache entry is a file stored on the disk which contains 3 sections: // * A header section (providing information about file structure) // * The binary contained in the cl_program // * A signature which provides additionnal informations about the kernel // and allows to characterize the kernel in the disk cache // // The environment variable CLBLAS_CACHE_PATH defines the location of the // cache on the disk. If the variable CLBLAS_CACHE_PATH is not defined, no // cache file is written on the disk, but the cl_program can be built and // remains on memory // // Concerning multithreading, the policy is that every thread build the // cl_program from the source, but only the first one writes it on the // disk. Other threads continue with the cl_program in memory. // // A typical cache query shall be composed of the following steps: // // (1) Create a local instance of BinaryLookup // // (2) Specify the additional characteristics (i.e. variants) of the // requested program. Those information combined with the program // name and the OpenCL context and device shall form a unique // signature for the binary program. 
// // (3) Perform the effective search by calling the 'found' method // // (4) if the search was successfull then cl_program can be retreived // by a call to the 'getProgram' method // // (5) if the search was not successfull then a cl_program // must be created and populated in the cache by a call // to the 'setProgram' method. // // (6) Destroy the BinaryLookup local instance. // // For instance, that could be // // cl_program program ; // // The program name is part of the signature and shall be unique // const char * program_name = "... my unique program name ... " ; // // BinaryLookup bl(context, device, program_name); // // // Specify additionnal information used to build a // // signature signature for that cache entry // // bl.variantInt( vectorSize ); // bl.variantInt( hasBorder ); // ... // // // Perform the query // if ( bl.found() ) // { // // Success! use the cl_program retreived from the cache // program = bl.getProgram(); // } // else // { // // Failure! we need to build the program ourself // program = build_the_program(context,device,vectorSize,...) ; // // Inform the lookup object of the program // bl.setProgram(program); // // And populate the cache // bl.populateCache() // } // // Remark: The members buildFromSource, buildFromBinary etc are utility // functions that can be used to build the cl_program from either // sources or binary (e.g. SPIR). Their use is optionnal. // // class BinaryLookup { public: // Constructor // \param ctxt the context for which the cl_program should be built // \param device the device for which the cl_program should be built // \param kernel_name the kernel identifier BinaryLookup(cl_context ctxt, cl_device_id device, const std::string & kernel_name); ~BinaryLookup(); // Methods to fill up the signature of the cache entry void variantInt(int num); void variantDouble(double num); void variantCompileOptions(const std::string & opts); void variantRaw(const void * data, size_t bytes); // Indicates whether or not the cache entry was found on the disk // If the cache entry was found and is complete on the disk, its content // is loaded // \return true if a cache entry was found, false else bool found(); // Build a cl_program from the source code and init attributes // of the current structure // so that the program can be accessed with the getProgram method // Write the file to the cache cl_int buildFromSource(const char * source); // Build a cl_program from the source code and init attributes // so that the program can be accessed with the getProgram method // Write the file to the cache cl_int buildFromBinary(const void * data, size_t len, const char * BuildOption); // Returns the cl_program built from binary or loaded from disk cl_program getProgram(); // Set the current m_program to the given program void setProgram(cl_program program); // Build a cl_program from a text static cl_program buildProgramFromSource(const char * filename, cl_context context, cl_device_id device, cl_int & err, const char * options = 0); // Build a cl_program from binary static cl_program buildProgramFromBinary(const char * data, size_t data_size, cl_context context, cl_device_id device, cl_int & err, const char * options = 0); // Initialize the whole cache file information (magic_key, header and program) // and dump on the disk cl_int populateCache(); private: // Serialize variants and compute the checksum to load the file from cache void finalizeVariant(); // Build a cl_program from the source code and init attributes // so that the program can be accessed with the 
getProgram method // Do not write the file to the cache cl_int buildFromLoadedBinary(const void * data, size_t len, const char * BuildOption); // Try to retrieve the header of the cache file // Returns: ok if the header sections was successfully loaded, false else bool loadHeader(std::ifstream &file, size_t length); // Try to retrieve the cl_program and its signature in file // Returns: ok if the binary and signature sections were successfully loaded, false else bool loadBinaryAndSignature(std::ifstream &file); // Try to create a file associated to the current program/variant in the cache folder // Returns true if the file was successfully opened and loaded, false else bool tryLoadCacheFile(); // Dump the file on the disk with the name stored in this->m_cache_entry_name cl_int writeCacheFile(std::vector &data); // Retrieve device name, device vendor and driver number by calling // clGetDeviceInfo cl_int retrieveDeviceAndDriverInfo(); // Cache entry name std::string m_cache_entry_name; // Path for the cache entry name std::string m_path; // Header structure of a cache entry typedef struct Header_ { char magic_key[4]; // = |C|L|B|\0, useful to know that we are loading a clblas cache entry int whole_file_size; // the whole file of the size to know if the current file is complete or not int header_size; // = sizeof(Header) int binary_size; // kernel binary size int signature_size; // variant information } Header; Header m_header; cl_context m_context; cl_device_id m_device; cl_program m_program; unsigned char * m_binary; char * m_signature; enum VariantKind { INT, DOUBLE, STRING, DATA }; struct Variant { Variant(); Variant(VariantKind kind, char * data, size_t size); ~Variant(); VariantKind m_kind; size_t m_size; char * m_data; static char * serialize(VariantKind kind, char * data, size_t size); static Variant unserialize(char * data); }; // Cache entry, useful to abstract Windows and linux // cache entry file descriptor struct CacheEntry { CacheEntry(const std::string & filename); bool exclusive_create(); void close(); bool successful_creation(); private: std::string m_filename; bool m_successful_creation; void * m_handle; }; // Variants std::vector m_variants; // Indicates whether the cache should be used or not bool m_cache_enabled; }; #undef SIZE #endif clblas-2.10/src/include/clblas_stddef.h000066400000000000000000000047771264277366700201130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_STDDEF_H_ #define CLBLAS_STDDEF_H_ static __inline size_t szmin(size_t a, size_t b) { return (a <= b ? a : b); } static __inline size_t szmax(size_t a, size_t b) { return (a >= b ? a : b); } static __inline unsigned int umin(unsigned int a, unsigned int b) { return (a <= b ? a : b); } static __inline unsigned int umax(unsigned int a, unsigned int b) { return (a >= b ? 
a : b); } static __inline void uswap(unsigned int *a, unsigned int *b) { unsigned int tmp; tmp = *a; *a = *b; *b = tmp; } static __inline size_t roundDown(size_t a, size_t b) { return (a / b * b); } static __inline size_t roundUp(size_t a, size_t b) { return (a + b - 1) / b * b; } static __inline size_t divRoundUp(size_t a, size_t b) { return (a / b) + (a % b != 0); } static __inline int isRoundedPow2(size_t a) { return ((a & (a - 1)) == 0); } /* * Return zero based sequential number of the highest set bit the * number. If the number is 0, then the function returns -1. */ static __inline int findHighestSetBit(size_t a) { int n = (sizeof(size_t) * 8 - 1); size_t s = (size_t)1 << n; for (; (s != 0) && !(s & a); s >>= 1) { n--; } return (s == 0) ? -1 : n; } static __inline size_t roundDownPow2(size_t a) { size_t s; if (isRoundedPow2(a)) { return a; } s = (size_t)1 << (sizeof(size_t) * 8 - 1); // find the highest non zero bit for (; (s != 0) && !(s & a); s >>= 1); return s; } /* * With BLAS we never deal with so large number sufficient for overflowing. * So, it's safe */ static __inline size_t roundUpPow2(size_t a) { size_t s; if (isRoundedPow2(a)) { return a; } s = roundDownPow2(a); return (s << 1); } #endif /* CLBLAS_STDDEF_H_ */ clblas-2.10/src/include/clkern.h000066400000000000000000000116431264277366700165660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef CLARGS_H_ #define CLARGS_H_ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #include #ifdef __cplusplus extern "C" { #endif #define INIT_KARG(karg, val) \ do { \ memcpy((karg)->arg.data, &(val), sizeof(val)); \ (karg)->typeSize = sizeof(val); \ } while (0) enum { MAX_KERNEL_ARGS = 32, MAX_ARG_SIZE = sizeof(cl_double2), MAX_WORK_DIM = 3 }; // memory object data transfer direction typedef enum MemobjDir { MEMOBJ_READ = 0x1, MEMOBJ_WRITE = 0x2 } MemobjDir; typedef enum KernelLaunchPhase { PHASE_SET_ARGS, PHASE_ENQUEUE_WRITE, PHASE_ENQUEUE_KERNEL, PHASE_PROFILING, PHASE_ENQUEUE_READ } KernelLaunchPhase; typedef union KernelArgValue { cl_mem mem; int ival; unsigned char data[MAX_ARG_SIZE]; } KernelArgValue; /* * Structure describing an argument to be passed to a kernel * * @arg: pointer to the argument * @ardIdx: argument index in the kernel argument list * @hostBuf: buffer to copy data to/from from/to GPU memory * @enqType: buffer enqueue type * @sync: blocking I/O * @event: event for I/O */ typedef struct KernelArg { KernelArgValue arg; unsigned int typeSize; // argument type size, ignored for mem objects void *hostBuf; // host buffer for using with OpenCL memory objects size_t hostBufLen; MemobjDir dir; } KernelArg; typedef struct KernelDesc { cl_kernel kernel; size_t globalThreads[MAX_WORK_DIM]; size_t localThreads[MAX_WORK_DIM]; size_t workDim; const cl_event *eventWaitList; size_t waitListSize; cl_event *event; int nowait; int needExecTime; KernelArg args[MAX_KERNEL_ARGS]; unsigned long execTime; } KernelDesc; typedef struct KernelErrorInfo { unsigned int wrongArg; KernelLaunchPhase phase; } KernelErrorInfo; /* * store kernel arguments launch the kernel and read its results * * @kernDesc: descriptor of the kernel to be launched * @queue: command queue associated with the device * @errInfo: location to store info about occurred error, * ignored if NULL * * The function gets itself number of arguments to the kernel * usging the OpenCL API */ cl_int launchClKernel( KernelDesc *kernDesc, cl_command_queue queue, KernelErrorInfo *errInfo); /* * build a program from source * * @source: program source * @buildOpts: options to the opencl program builder * @DevID: ID of device to create program for * @logBuf: buffer to store build log at error * @status: location to store OpenCL status at error * * On success returns a build program object. * On error returns , and stores to the 'status' location * opencl status; if result is returned, but 'status' * cointains 'CL_SUCCESS', it means an file I/O or memory allocation * failure is occurred. 
If 'status' is set to NULL, it is ignored */ cl_program buildClProgram( const char *source, const char *buildOpts, cl_context ctx, cl_device_id devID, char *logBuf, size_t logBufSize, cl_int *status); /* * TODO: Doxygen-style comments */ cl_program createClProgramWithBinary( cl_context ctx, cl_device_id devID, unsigned char *binary, size_t binSize, cl_int *status); /* * TODO: Doxygen-style comments */ size_t getProgramBinarySize(cl_program program); /* * TODO: Doxygen-style comments */ unsigned char *getProgramBinary(cl_program program); /* * set a kernel argument of the size_t type */ static __inline void initSizeKarg(KernelArg *arg, size_t value) { memcpy(arg->arg.data, &value, sizeof(cl_uint)); arg->typeSize = sizeof(cl_uint); } /* * @inOut: memory object data transfer direction */ static __inline void initMemobjKarg( KernelArg *karg, cl_mem memobj, void *hostBuf, size_t hostBufLen, MemobjDir dir) { karg->arg.mem = memobj; karg->typeSize = sizeof(cl_mem); karg->hostBuf = hostBuf; karg->hostBufLen = hostBufLen; karg->dir = dir; } #ifdef __cplusplus } #endif #endif /* CLARGS_H_ */ clblas-2.10/src/include/cltypes.h000066400000000000000000000037241264277366700167740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLTYPES_H_ #define CLTYPES_H_ #include #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif /** * @internal * @defgroup DTYPES Data types */ /*@{*/ /** * @brief OpenCL type identifiers */ typedef enum DataType { TYPE_FLOAT, /**< single float precision type */ TYPE_DOUBLE, /**< double float precision type */ TYPE_COMPLEX_FLOAT, /**< single float precision complex type */ TYPE_COMPLEX_DOUBLE, /**< double float precision complex type */ TYPE_UNSIGNED_INT /**< Unsigned int, for output buffer for iAMAX routine */ } DataType; /*@}*/ enum { FLOAT4_VECLEN = sizeof(cl_float4) / sizeof(cl_float) }; /* * return size of a BLAS related data type */ #ifdef __cplusplus extern "C" #endif unsigned int dtypeSize(DataType type); /* * width of the matrix (block) in float4 words */ size_t fl4RowWidth(size_t width, size_t typeSize); static __inline bool isDoubleBasedType(DataType dtype) { return (dtype == TYPE_DOUBLE || dtype == TYPE_COMPLEX_DOUBLE); } static __inline bool isComplexType(DataType dtype) { return (dtype == TYPE_COMPLEX_FLOAT || dtype == TYPE_COMPLEX_DOUBLE); } #endif /* CLTYPES_H_ */ clblas-2.10/src/include/dblock_kgen.h000066400000000000000000000161471264277366700175560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Common generators for functions manipulating * with data blocks placed in the global, local, * or private memory. */ /* * TODO: add the unroll option to 'rwMatrBlockGen' * and 'smulMatrBlockGen' */ #ifndef DBLOCK_KGEN_H_ #define DBLOCK_KGEN_H_ #include #include /** * @internal * @defgroup MAJOR_GENS Major common used generators */ /*@{*/ /** * @internal * @brief Data block copying directions */ typedef enum DBlockCopyDirection { /** Copy from the global to the local memory */ DBLOCK_GLOBAL_TO_LOCAL, /** Copy from the local to the global memory */ DBLOCK_LOCAL_TO_GLOBAL, /** Copy from the global memory to an image */ DBLOCK_GLOBAL_TO_IMAGE, /** Copy from the local memory to an image */ DBLOCK_LOCAL_TO_IMAGE } DBlockCopyDirection; /** * @internal * @brief Data block copying flags */ typedef enum DBlockCopyFlags { DBLOCK_COPY_TRANSPOSE = 0x1, /**< Transpose 2D block */ /** pack several rows into single image row */ DBLOCK_COPY_PACKED_IMAGE = 0x2, DBLOCK_COPY_CONJUGATE = 0x4, /**< Conjugate complex elements */ DBLOCK_COPY_NOT_VECTORIZE = 0x8 /**< Disable vectorized copying */ } DBlockCopyFlags; /** * @internal * @brief Generator to copy data blocks between different kinds * of memory * * @param[out] ctx Generator context * @param[in] dim Subproblem dimension to generate a function for * @param[in] pgran Data parallelism granularity * @param[in] dtype Data type * @param[in] dir Copying direction * @param[in] flags Copying flags; when an image is used as destination * block transposing is prohibited * * If 'dim' is set to NULL a generic version working with subproblem * of any dimension is generated. In the case specific work group * sizes are ignored, only work group dimension is used. * * 'x' field of the passed SuproblemDim structure should contain * the block width * 'y' should contain the block height * * Copied blocks can be as well one as two dimensional. For any one * dimensional block 'y' field of the dimension structure should be * set to 1. If a block is two dimensional, and the local memory is \n * the source or destination memory, the block's rows must be aligned * to float4 boundary. * * Rows of the matrix block must be aligned to float4 boundary. \n * * Generated functions have the following definitions: \n *\n * Buffer-buffer copying function for optimal blocks: \n * @code * void * funcName( * dst, * src, * size_t startRow, * size_t startCol, * size_t ld) * @endcode * * The unified pointer types can be GPtr if the global memory is used or LPtr * is the local memory is used respectively * (See the "Data types in kernels" section). 
Function naming rule is follow: \n * (type prefix)copyDBlock['Transp']['Conj']['Nvec'](src mem][dst mem] * [block height][block width] \n * The 'Nvec' suffix is added if vectorized copying is prohibited.\n *\n * Buffer-buffer copying function, generic version: \n * @code * void * funcName( * dst, * src, * size_t startRow, * size_t startCol, * size_t nrRows, * size_t nrCols, * size_t dstLD, * size_t srcLD) * @endcode * * Here "dstLD" is destination leading dimension, "srcLD" - source leading * dimension. \n * Naming rule is the same as for the function above except block sizes. \n *\n * Function copying optimal blocks from the global memory to an image: \n * @code * void * funcName( * __write_only image2d_t dst, * size_t startX, * size_t startY, * GPtr src, * size_t startRow, * size_t startCol, * size_t ld) * @endcode * 'start' and 'startY' arguments is start X and Y coordinate in the image to * write from. The generic version has the analogous definition, and takes two * additional arguments 'nrRows' and 'nrCols' of the size_t type following just * fter the 'startCol' argument. \n *\n * Function copying optimal blocks from the local memory to an image: \n * @code * void * funcName( * __write_only image2d_t dst, * size_t startX, * size_t startY, * LPtr src) * @endcode * The generic version takes two additional arguments 'nrRows' and 'nrCols' of the * size_t type following just after the 'src' argument. * * @return 0 on success; on error returns negated error code: * * - -EINVAL: unsupported data type is passed, or * 'DBLOCK_COPY_TRANSPOSE' is set when * an image is used as destination * - -ENOTSUP: unsupported copying direction is passed * - -EOVEFFLOW: code buffer overflowed */ int copyDataBlockGen( struct KgenContext *ctx, const SubproblemDim *dim, const PGranularity *pgran, DataType dtype, DBlockCopyDirection dir, DBlockCopyFlags flags); /*@}*/ /* * Zero data block in the local or global memory * * @ctx: generator context * @dim: Subproblem dimension to generate the function for * @pgran: data parallelism granularity * @memPrefix: type of memory to generate the function for * * The 'memPrefix' field of the passed BlasKernExtra structure * should contain the type of memory the buffer is stored in. * It cane take one of the "__local", or the "__global" value. * * 'x' field of the passed SuproblemDim structure should contain * the block width in float4 words. In the case the function takes only * a buffer pointer. If the field is set to 'SUBDIM_UNUSED' * the function is generated without any loop unrollings. In the case * the function takes buffer length as the second argument. * * If 'unroll' is set, the 'bwidth' field of the structure should * contain the maximum width of a block zeroed with loop unrolling. * If 'unroll' is set but the 'bwidth' is set to 'SUBDIM_UNUSED', * the generator don't apply any restriction to loop unrolling. * The parameter is ignored if the 'x' field of the 'dim' is set to * 'SUBDIM_UNUSED'. * * On success returns 0, on error returns negated error code: * * -EINVAL: wrong memory prefix is passed * -EOVEFFLOW: code buffer overflowed */ int f4zeroBlockGen( struct KgenContext *ctx, const SubproblemDim *dim, const PGranularity *pgran, const char *memPrefix); #endif /* DBLOCK_KGEN_H_ */ clblas-2.10/src/include/defbool.h000066400000000000000000000025051264277366700167170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef DEFBOOL_H_ #define DEFBOOL_H_ #if defined(_MSC_VER) && _MSC_VER <= 1700 /* FIX for windows compilation #if !defined(__cplusplus) typedef int _Bool; #define bool _Bool enum { false, true }; #endif */ #define __bool_true_false_are_defined 1 #ifndef __cplusplus #define bool _Bool #if __STDC_VERSION__ < 199901L && __GNUC__ < 3 #define false 0 #define true 1 typedef int _Bool; #endif #endif /* !__cplusplus */ #else /* defined(_MSC_VER) && _MSC_VER <= 1700 */ #include #endif /* defined(_MSC_VER) && _MSC_VER <= 1700 */ #endif /* DEFBOOL_H_ */ clblas-2.10/src/include/devinfo.h000066400000000000000000000053631264277366700167440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef DEVINFO_H_ #define DEVINFO_H_ #include #ifdef __cplusplus extern "C" { #endif /* * TODO: Expand these enumerations in respect with known * vendors and devices */ typedef enum DeviceVendor { VENDOR_UNKNOWN, VENDOR_AMD, VENDOR_NVIDIA } DeviceVendor; typedef enum DeviceFamily { DEVICE_FAMILY_UNKNOWN, GPU_FAMILY_EVERGREEN, GPU_FAMILY_FERMI } DeviceFamily; typedef enum DeviceChip { CHIP_UNKNOWN, REDWOOD, JUNIPER, CYPRESS, HEMLOCK, CAYMAN, TAHITI, HAWAII, BONAIRE, GEFORCE_GTX_480, GEFORCE_GTX_580, NUM_DEVICE_CHIPS } DeviceChip; typedef struct DeviceIdent { DeviceVendor vendor; DeviceFamily family; DeviceChip chip; } DeviceIdent; typedef struct DeviceHwInfo { unsigned int wavefront; unsigned int channelSize; unsigned int bankSize; unsigned int l1CacheAssoc; } DeviceHwInfo; typedef struct TargetDevice { cl_device_id id; DeviceIdent ident; bool hwInfoValid; DeviceHwInfo hwInfo; } TargetDevice; cl_int identifyDevice(TargetDevice *target); cl_uint deviceComputeUnits (cl_device_id device, cl_int *error); cl_ulong deviceLDSSize (cl_device_id device, cl_int *error); cl_uint deviceWavefront (cl_device_id device, cl_int *error); cl_uint deviceDataAlignment (cl_device_id device, cl_int *error); cl_uint deviceAddressBits (cl_device_id device, cl_int *error); bool deviceHasNativeDouble (cl_device_id device, cl_int *error); bool deviceHasNativeComplex(cl_device_id device, cl_int *error); cl_ulong deviceL2CacheSize (cl_device_id device, cl_int *error); cl_ulong deviceL1CacheSize (cl_device_id device, cl_ulong l2CacheSize, cl_int *error); cl_uint deviceL1CacheAssoc (cl_device_id device, cl_ulong l1CacheSize, cl_int *error); size_t deviceMaxWorkgroupSize (cl_device_id device, cl_int *error); #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* DEVINFO_H_ */ clblas-2.10/src/include/dis_warning.h000066400000000000000000000044041264277366700176110ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef DIS_WARNING_H_ #define DIS_WARNING_H_ #if _MSC_VER #pragma warning (disable:4204) #pragma warning (disable:4127) #define MAY_ALIAS #else /* _MSC_VER */ #define MAY_ALIAS __attribute__((__may_alias__)) #endif /* * Set of macro to mute gcc when we don't need in using some * function arguments */ #define DUMMY_ARG_USAGE(arg) \ do { \ (void)arg; \ } while (0) #define DUMMY_ARGS_USAGE_2(arg1, arg2) \ do { \ (void)arg1; \ (void)arg2; \ } while (0) #define DUMMY_ARGS_USAGE_3(arg1, arg2, arg3) \ do { \ (void)arg1; \ (void)arg2; \ (void)arg3; \ } while(0) \ #define DUMMY_ARGS_USAGE_4(arg1, arg2, arg3, arg4) \ do { \ (void)arg1; \ (void)arg2; \ (void)arg3; \ (void)arg4; \ } while(0) \ #endif /* DIS_WARNING_H_ */ clblas-2.10/src/include/granulation.h000066400000000000000000000051611264277366700176310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Data and execution granulation */ #ifndef GRANULATION_H_ #define GRANULATION_H_ /** * @internal * @brief Decomposition axis * @ingroup PROBLEM_DECOMPOSITION */ typedef enum DecompositionAxis { DECOMP_AXIS_Y, DECOMP_AXIS_X } DecompositionAxis; /** * @internal * @brief Data parallelism granularity * @ingroup PROBLEM_DECOMPOSITION */ typedef struct PGranularity { /** work group sizes */ unsigned int wgSize[2]; /** work group dimension */ unsigned int wgDim; /** wavefront size */ unsigned int wfSize; /** Record number of work-groups spawned */ unsigned int numWGSpawned[2]; /** max number of work group size */ unsigned int maxWorkGroupSize; } PGranularity; /** * @internal * @brief Subproblem dimensions * * The structure represents how a problem is decomposed during * the computation. The decomposition is made in terms of * resulting data. It describes as well what portion of work each * computing item gets as what chunk it evaluates at a time. * The chunk processed at a time is typically bound by amount * of resources consumed at this level of decomposition while * the whole portion is bound of amount of more high level resources * to be available, and can also be used for the purpose of work * balancing. 
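 *
 * For instance (an illustrative assignment only, not taken from any
 * particular solver), a work item that evaluates a 4 x 8 tile of the result
 * per step, consuming the common dimension in chunks of 8 elements, and that
 * owns a 16 x 32 region of the output overall, could be described as:
 * @code
 * SubproblemDim dim;
 * dim.y      = 4;    // tile height evaluated at a time
 * dim.x      = 8;    // tile width evaluated at a time
 * dim.bwidth = 8;    // width of the data block consumed per step
 * dim.itemY  = 16;   // whole per-item subproblem height
 * dim.itemX  = 32;   // whole per-item subproblem width
 * @endcode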
* * @ingroup PROBLEM_DECOMPOSITION */ typedef struct SubproblemDim { size_t x; /**< Subproblem step size in X dimension */ size_t y; /**< Subproblem step size in Y dimension */ /** Width of data blocks processed consecutively * to evaluate a subproblem of 'x' by 'y' size */ size_t bwidth; size_t itemX; /**< Size of the whole subproblem in X dimension evaluated by a computing item */ size_t itemY; /**< Size of the whole subproblem in Y dimension evaluated by a computing item */ } SubproblemDim; #endif /* GRANULATION_H_ */ clblas-2.10/src/include/kern_cache.h000066400000000000000000000107311264277366700173670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * OpenCL kernel cache */ #ifndef KERN_CACHE_H_ #define KERN_CACHE_H_ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #include #include #include #include #include struct KernelCache; /* Unique kernel characteristics */ typedef struct KernelKey { cl_device_id device; cl_context context; unsigned int nrDims; SubproblemDim subdims[MAX_SUBDIMS]; } KernelKey; /* * structure describing an optimal CL kernel for some * memory pattern and subproblem dimensions */ typedef struct Kernel { cl_program program; // program the kernel belongs to /* extra information specific for the application field */ void *extra; size_t extraSize; void (*dtor)(struct Kernel *kern); int noSource; } Kernel; typedef int (*KernelExtraCmpFn)(const void *extra, const void *extraKey); /* * Create a kernel cache * * @nrSolvers: total number of solvers whose kernels can be stored in the cache * @sizeLimit: limit of the cache in bytes; * if set to 0 the cache size is * unlimited * * On success returns a pointer to the kernel cache object; * on error returns NULL if the needed resources could not be allocated */ struct KernelCache *createKernelCache( unsigned int nrSolvers, size_t sizeLimit); void destroyKernelCache(struct KernelCache *kcache); /* * Allocate a kernel * * After allocation the structure is filled with zero bytes * and the kernel's reference counter is set to 1. * * Returns a pointer to the just created kernel, * or NULL if there is not enough memory * to allocate a kernel */ Kernel *allocKernel(void); /* * Get a reference to a kernel not yet added to a cache */ void getKernel(Kernel *kern); /* * Decrement the reference counter of this kernel * * @kcache: the cache the kernel was inserted into; * may be NULL if the kernel has not yet been * added to a cache, in which case it is ignored * * When there are no more references to the kernel, it is automatically * destroyed */ void putKernel(struct KernelCache *kcache, Kernel *kern); /* * Add a newly generated kernel to the cache * * @kcache: cache to add the kernel to * @sid: solver ID to add the kernel for * @kern: kernel to add * @key: kernel characteristics * * On success returns 0.
* On error returns -1, in one of the following cases: * the kernel size is larger than the maximum cache size, * or there is not enough memory to allocate internal * structures, * or the passed solver ID is wrong, * or 'nrDims' is wrong. */ int addKernelToCache( struct KernelCache *kcache, solver_id_t sid, Kernel *kern, const KernelKey *key, KernelExtraCmpFn extraCmp); /* * Find the kernel for the given OpenCL solver and * subproblem dimensions, and increment its reference counter * * On success returns the kernel actually stored in the cache. * On error returns NULL; it means the passed solver ID * is wrong, or no kernel for the given solver and subproblem * dimensions is stored in the cache */ Kernel *findKernel( struct KernelCache *kcache, solver_id_t sid, const KernelKey *key, const void *extraKey); /* * Get available size in the kernel cache */ size_t availKernelCacheSize(struct KernelCache *kcache); /* * Remove all kernels from the cache */ void cleanKernelCache(struct KernelCache *kcache); size_t fullKernelSize(struct Kernel *kern); #if defined(TRACE_MALLOC) void printKernelCacheSize(struct KernelCache *kcache); #else /* TRACE_MALLOC */ static __inline void printKernelCacheSize(struct KernelCache *kcache) { /* do nothing */ (void)kcache; } #endif /* !TRACE_MALLOC */ #endif /* KERN_CACHE_H_ */ clblas-2.10/src/include/kernel_extra.h000066400000000000000000000133131264277366700177670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KERNEL_EXTRA_H_ #define KERNEL_EXTRA_H_ #include enum { MAX_SOLVER_PRIVATE_SIZE = 16 }; // // Moving BUILD_OPTS_MAXLEN here.
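/*
 * Editor's illustrative sketch (not part of the original kern_cache.h): a
 * hedged example of how the kernel cache API declared above might be used
 * from some caller function. The solver id 'sid', the built cl_program
 * 'program' and the populated KernelKey 'key' are assumptions supplied by
 * the caller; clblasKernelExtraCmp is the comparator declared later in
 * kernel_extra.h.
 *
 *     struct KernelCache *kcache = createKernelCache(1, 0); // one solver, unlimited size
 *     Kernel *kern = allocKernel();                          // zero-filled, refcount == 1
 *     kern->program = program;                               // previously built cl_program (assumption)
 *
 *     addKernelToCache(kcache, sid, kern, &key, clblasKernelExtraCmp);
 *
 *     Kernel *found = findKernel(kcache, sid, &key, NULL);   // takes a reference on success
 *     if (found != NULL) {
 *         // ... enqueue a kernel built from found->program ...
 *         putKernel(kcache, found);                          // release the reference
 *     }
 *
 *     destroyKernelCache(kcache);
 */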
Originally in clblas-internal.h // Including "clblas-internal.h" enum { MEMPAT_PER_BLASFN = 8, BUILD_OPTS_MAXLEN = 256 }; /** * @internal * @brief BLAS kernel type identifiers * * @ingroup BLAS_SOLVERIF_SPEC */ typedef enum CLBlasKernelType { CLBLAS_COMPUTING_KERNEL, /**< Main computing kernel */ CLBLAS_PREP_A_KERNEL, /**< Kernel preparing matrix A */ CLBLAS_PREP_B_KERNEL, /**< Kernel preparing matrix B */ MAX_CLBLAS_KERNELS_PER_STEP } CLBlasKernelType; /** * @internal * @defgroup BLAS_SOLVERIF_SPEC BLAS specifics * @ingroup SOLVERIF */ /*@{*/ /** * @brief BLAS kernel flags * * These flags uniquely determine problem options kernels are generated for */ typedef enum KernelExtraFlags { /** Matches to a problem without any options */ KEXTRA_NO_FLAGS = 0, KEXTRA_TRANS_A = 0x01, /**< Matrix A should be transposed */ /** matrix A should be took in the conjugate form */ KEXTRA_CONJUGATE_A = 0x02, KEXTRA_TRANS_B = 0x04, /**< matrix B should be transposed */ /** Matrix B should be taken in the conjugate form */ KEXTRA_CONJUGATE_B = 0x08, KEXTRA_COLUMN_MAJOR = 0x10, /**< Order is column major */ /** * Matrix A is upper triangular, it is lower triangular * if this flag is not set */ KEXTRA_UPPER_TRIANG = 0x20, /** * Matrix A is placed on the right, it is placed * on the left if this flag is not set */ KEXTRA_SIDE_RIGHT = 0x40, /** * Unit diagonal matrix */ KEXTRA_UNIT_DIAGONAL = 0x80, /** kernel should process tails of upper level blocks in M dimension */ KEXTRA_TAILS_M = 0x100, /** kernel should process tails of upper level blocks in N dimension */ KEXTRA_TAILS_N = 0x200, /** kernel should process tails of upper level blocks in K dimension */ KEXTRA_TAILS_K = 0x400, /** Beta multiplier is zero */ KEXTRA_BETA_ZERO = 0x800, /** Disable vectorization at block copying for matrix A */ KEXTRA_NO_COPY_VEC_A = 0x1000, /** Disable vectorization at block copying for matrix B */ KEXTRA_NO_COPY_VEC_B = 0x2000, /** Disable vectorization at block copying for matrix C */ KEXTRA_NO_COPY_VEC_C = 0x4000, // SYRXK specific flags /** Diagonal solution blocks are evaluated in a separate kernel */ KEXTRA_SYRK_SEPARATE_DIAGONAL = 0x8000, /** Evaluate diagonal solution blocks for a SYRXK function */ KEXTRA_SYRK_EVALUATE_DIAGONAL = 0x10000, /** 2k rank update */ KEXTRA_SYRK_2K_RANK = 0x20000, // BLAS2 specific flags /** Incx increment is one */ KEXTRA_INCX_ONE = 0x40000, /** Incy increment is one */ KEXTRA_INCY_ONE = 0x80000, // Generator specific flags /** MAD function can be used */ // FIXME: throw this kludge away KEXTRA_ENABLE_MAD = 0x100000, // FIXME: It's a kludge, pass further DeviceIndent structure to generators KEXTRA_VENDOR_AMD = 0x200000, /* Flags showing not zero starting offsets for kernels */ KEXTRA_STARTM_NOT_ZERO = 0x400000, KEXTRA_STARTN_NOT_ZERO = 0x800000, //KEXTRA_STARTK_NOT_ZERO = 0x2000000, /** Matrix A offset in a memory object is not zero */ KEXTRA_A_OFF_NOT_ZERO = 0x1000000, /** Matrix B or vector X offset in a memory object is not zero */ KEXTRA_BX_OFF_NOT_ZERO = 0x2000000, /** Matrix C or vector Y offset in a memory object is not zero */ KEXTRA_CY_OFF_NOT_ZERO = 0x4000000, /** kernel should process tails of lower level blocks in M dimension */ KEXTRA_TAILS_M_LOWER = 0x8000000, /** kernel should process tails of lower level blocks in N dimension */ KEXTRA_TAILS_N_LOWER = 0x10000000, /** kernel should process tails of lower level blocks in K dimension */ KEXTRA_TAILS_K_LOWER = 0x20000000 } KernelExtraFlags; /** * @internal * @brief extra information CLBLAS kernel generator * @ingroup 
BLAS_SOLVERIF_SPEC */ typedef struct CLBLASKernExtra { DataType dtype; /**< Data type */ KernelExtraFlags flags; /**< Kernel flags identifying a problem */ CLBlasKernelType kernType; /**< Kernel type */ // Fixme: Deprecate it; now it is just for backward compatibility unsigned int vecLen; /**< vector length to evaluate with */ /** vector length for matrix A elements to evaluate with */ unsigned int vecLenA; /** vector length for matrix B elements to evaluate with */ unsigned int vecLenB; /* * FIXME: remove this kludge; vectorization for the result should be * autodetected */ unsigned int vecLenC; char solverPriv[MAX_SOLVER_PRIVATE_SIZE]; char buildOptions[BUILD_OPTS_MAXLEN]; // Build Flags used for the kernel call } CLBLASKernExtra; /* * function to compare blas kernels extra information */ int clblasKernelExtraCmp(const void *extra, const void *extraKey); /*@}*/ #endif /* KERNEL_EXTRA_H_ */ clblas-2.10/src/include/kerngen.h000066400000000000000000000432141264277366700167400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Kernel generator related common definitions */ #ifndef KERNGEN_H_ #define KERNGEN_H_ #include #include #if defined (_MSC_VER) #include #endif #include #include #include #include #include #include /** * @internal * @defgroup KGEN_INFRA Kernel generator infrastructure */ /*@{*/ #ifdef _MSC_VER #define SPREFIX "I" #else #define SPREFIX "z" #endif #define SUBDIM_UNUSED (size_t)-1 enum { MAX_TABS = 16, MAX_STATEMENT_PRIORITY = 63, MAX_STATEMENT_LENGTH = 4096 }; enum { // maximum subproblem dimensions MAX_SUBDIMS = 3, // maximum code nesting MAX_NESTING = 10, KSTRING_MAXLEN = 256, // generated function name max len FUNC_NAME_MAXLEN = KSTRING_MAXLEN }; typedef struct{ SubproblemDim subdims[MAX_SUBDIMS]; PGranularity pgran; }DecompositionStruct; struct KgenContext; struct KgenGuard; struct StatementBatch; /** * @internal * @defgroup KGEN_TYPES Types * @ingroup KGEN_INFRA */ /*@{*/ /** * @internal * @brief Memory fence type */ typedef enum CLMemFence { /** Fence for operations against the local memory */ CLK_LOCAL_MEM_FENCE, /** Fence for operations against the global memory */ CLK_GLOBAL_MEM_FENCE } CLMemFence; // TODO: deprecate typedef enum UptrType { UPTR_GLOBAL, UPTR_LOCAL, UPTR_PRIVATE } UptrType; /** * @internal * @brief Null-terminated string being a part of a kernel */ typedef struct Kstring { /** Buffer storing the string */ char buf[KSTRING_MAXLEN]; } Kstring; /** * @internal * @brief Type of custom generator for loop unrolling */ typedef int (*LoopUnrollGen)(struct KgenContext *ctx, void *priv); /*@}*/ /** * @internal * @brief Unrolled loop control information */ typedef struct LoopCtl { const char *ocName; /**< outer loop counter name */ union { const char *name; unsigned long val; } outBound; /**< outer loop bound */ bool obConst; /**< outer loop bound is 
constant flag */ unsigned long inBound; /**< inner loop bound */ } LoopCtl; /** * @internal * @brief Set of loop unrolling subgenerators */ typedef struct LoopUnrollers { /** generate preparative code before unrolling */ LoopUnrollGen preUnroll; /** generate single step for unrolled body in the vectorized way */ LoopUnrollGen genSingleVec; /** generated single step for unrolled body in non vectorized way */ LoopUnrollGen genSingle; /** generate code that should be inserted just after unrolled loop body */ LoopUnrollGen postUnroll; /** return veclen*/ LoopUnrollGen getVecLen; } LoopUnrollers; /*@}*/ static __inline void emptyKstring(Kstring *kstr) { kstr->buf[0] = '\0'; } static __inline bool isKstringEmpty(const Kstring *kstr) { return (kstr->buf[0] == '\0'); } /** * @internal * @defgroup KGEN_CORE Core API * @ingroup KGEN_INFRA */ /*@{*/ /** * @internal * @brief Create new generator context * * @param[out] srcBuf Source buffer; if NULL, then any statements * were not actually added to the source buffer, just * their overall size will be calculated * @param[in] srcBufLen Maximal length of the source which is being * generated; ignored if an actual buffer was not * specified * @param[in] fmt Format the source. Code formatting assumes * tabulation and watch line width * * @return New generator context on success. Returns NULL * if there is not enough memory to allocate internal structures */ struct KgenContext *createKgenContext(char *srcBuf, size_t srcBufLen, bool fmt); /** * @internal * @brief Destroy a kernel generator context * * @param[out] ctx An existing generator context to be destroyed */ void destroyKgenContext(struct KgenContext *ctx); /** * @internal * @brief Reset a kernel generator context used before * * @param[out] ctx A generator context to be reset * * Clear the source buffer and another information associated * with this context */ void resetKgenContext(struct KgenContext *ctx); /** * @internal * @brief Synchronize formatting of 2 contexts * * @param[in] srcCtx Source generator context * @param[out] dstCtx Destination generator context * @param[in] nrTabs Tabs number to be inserted in the source context. * It is relative on the current nesting level of the * target context. It must be not less than zero, and * resulting number of tabs which is evaluated as * the target context's nesting level plus 'nrTabs' * must not exceed 'MAX_TABS' * * The function is usable when it's needed to insert a code from * one context into another one, and don't disturb formatting. * * @return 0 on success, -EINVAL if the 'nrTabs' parameter is out * of range */ int kgenSyncFormatting( struct KgenContext *srcCtx, const struct KgenContext *dstCtx, int nrTabs); /** * @internal * @brief Add a function declaration * * @param[out] ctx Generator context * @param[in] decl The declaration to be added * * @return 0 on success; -1 if the source code exceeds the buffer, * or level of the code nesting is not zero, or the returned * type is not defined, or there is not a paranthesis opening * the argument list */ int kgenDeclareFunction(struct KgenContext *ctx, const char *decl); /** * @internal * @brief Begin function body * * @param[out] ctx Generator context * * Adds the opening bracket and increments a nesting counter. 
* * @return 0 on success; -1 if the source code exceeds the buffer */ int kgenBeginFuncBody(struct KgenContext *ctx); /** * @internal * @brief End function body * * @param[out] ctx Generator context * * Adds the closing bracket and decrements a nesting counter * * @return 0 on success; -1 if the source code exceeds the buffer, * or code nesting is not 1 */ int kgenEndFuncBody(struct KgenContext *ctx); /** * @internal * @brief Get the last declared function name for the context * * @param[out] buf A buffer to store the function name * @param[in] buflen Size of the buffer * @param[in] ctx Generator context * * @return 0 on success, with the function name stored in 'buf'; -1 * if no functions were declared or the passed buffer is * insufficient */ int kgenGetLastFuncName( char *buf, size_t buflen, const struct KgenContext *ctx); /** * @internal * @brief Begin a new execution branch: conditional branch or loop * * @param[out] ctx Generator context * @param[in] stmt A statement containing a branch control code. * Ignored if NULL. * * The opening bracket and trailing new line symbol are added * automatically and should not be passed * * @return 0 on success; -1 if the overall source exceeds the set * limit or nesting exceeds the maximum allowed one */ int kgenBeginBranch(struct KgenContext *ctx, const char *stmt); /** * @internal * @brief End the current code branch * * @param[out] ctx Generator context * @param[in] stmt A statement containing a branch control code * * The closing bracket as well as the trailing ';' and '\n' are added automatically and * should not be passed. * The statement passed in 'stmt' is appended after the closing bracket. * * @return 0 on success; -1 if the overall source exceeds the set limit, * or there is no opened branch */ int kgenEndBranch(struct KgenContext *ctx, const char *stmt); /** * @internal * @brief Add a statement to the generated source * * @param[out] ctx Generator context * @param[in] stmt A statement to be added * * If formatting is enabled and the statement is multiline, all the lines are * formatted automatically. It is strongly discouraged to add with this * function any statements containing variable or function declarations, * or branch bounds. The appropriate functions should be used for that to avoid * unexpected side effects.
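 *
 * Illustrative sketch added by the editor (not part of the original header),
 * assuming a caller that only needs a tiny generated function; the buffer
 * size and the generated OpenCL body are placeholders made up for the
 * example:
 *
 *     char src[1024];
 *     struct KgenContext *ctx = createKgenContext(src, sizeof(src), true);
 *
 *     kgenDeclareFunction(ctx, "__kernel void dummy(__global float *x)");
 *     kgenBeginFuncBody(ctx);
 *     kgenAddStmt(ctx, "x[get_global_id(0)] *= 2.0f;\n");
 *     kgenEndFuncBody(ctx);
 *
 *     // 'src' now holds the generated source; kgenSourceSize(ctx) is its size
 *     destroyKgenContext(ctx);
 *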
* * @return 0 on success; -1 if the overall source exceeds the set limit */ int kgenAddStmt(struct KgenContext *ctx, const char *stmt); int kgenPrintf(struct KgenContext *ctx, const char *fmt,...); struct StatementBatch *createStmtBatch(void); int kgenAddStmtToBatch( struct StatementBatch *batch, int priority, const char *stmt); int kgenBatchPrintf( struct StatementBatch *batch, int priority, const char *fmt,...); int flushStmtBatch(struct KgenContext *ctx, struct StatementBatch *batch); void destroyStmtBatch(struct StatementBatch *batch); /** * @internal * @brief Add a blank line to generated source * * @param[out] ctx Generator context * * @return 0 on success; -1 if the overall source exceeds * the set limit returns -1 */ int kgenAddBlankLine(struct KgenContext *ctx); /** * @internal * @brief Get resulting source size * * @param[out] ctx Generator context * * @return size of the overall source was added to the * generator context including the trailing null * byte */ size_t kgenSourceSize(struct KgenContext *ctx); /*@}*/ /** * @internal * @defgroup KGEN_BASIC Basic generating functions * @ingroup KGEN_INFRA */ /*@{*/ /** * @internal * @brief Add barrier * * @param[out] ctx Generator context * @param[in] fence Fence type * * @return 0 on success, and -EOVERFLOW on buffer overflowing */ int kgenAddBarrier(struct KgenContext *ctx, CLMemFence fence); /** * @internal * @brief Add memory fence * * @param[out] ctx Generator context * @param[in] fence Fence type * * @return 0 on success, and -EOVERFLOW on buffer overflowing */ int kgenAddMemFence(struct KgenContext *ctx, CLMemFence fence); /** * @internal * @brief Add local ID declaration and evaluating expression * * @param[out] ctx Generator context * @param[in] lidName Local id variable name * @param[in] pgran Data parallelism granularity * * The resulting expression depends on the work group dimension and size * of the first one. * * @return 0 on success, and -EOVERFLOW on buffer overflowing */ int kgenDeclareLocalID( struct KgenContext *ctx, const char *lidName, const PGranularity *pgran); /** * @internal * @brief Add work group ID declaration and evaluating expression * * @param[out] ctx Generator context * @param[in] gidName Group id variable name * @param[in] pgran Data parallelism granularity * * The resulting expression depends on the work group dimension and size * of the first one. * * @return 0 on success, and -EOVERFLOW on buffer overflowing */ int kgenDeclareGroupID( struct KgenContext *ctx, const char *gidName, const PGranularity *pgran); /* * TODO: deprecate when casting is eliminated * * declare unified pointers * * @withDouble: double based types pointers area needed * * On success returns 0, on buffer overflowing returns -EOVERFLOW */ int kgenDeclareUptrs(struct KgenContext *ctx, bool withDouble); /*@}*/ /** * @internal * @defgroup KGEN_HELPERS Generating helpers * @ingroup KGEN_INFRA */ /*@{*/ /** * @internal * @brief Assistant for loop body unrolling * * @param[out] ctx Generator context * @param[in] loopCtl Unrolled loop control information * @param[in] dtype Data type to unroll the loop body for * @param[in] unrollers Set of subgenerators; * If 'preUnroll', 'postUnroll' or 'vecUnroll' * is set to NULL, it is ignored. Vectorized unrolling * is not used for 'COMPLEX_DOUBLE' type * @param[out] priv Private data for generators * * The unrolled loop can be as well single as double. 
In the case * of the double loop only the inner loop is unrolled, and the outer * loop is generated in the standard way using the passed loop * counter name and its bound. For the single loop the 'ocName' field of the * 'loop' structure should be NULL. * * @return 0 on success. On error returns a negated error code:\n *\n * -EOVERFLOW: code buffer overflowed\n * -EINVAL: an invalid parameter is passed * (unsupported data type, or the 'genSingle' generator * is not specified) */ int kgenLoopUnroll( struct KgenContext *ctx, LoopCtl *loopCtl, DataType dtype, const LoopUnrollers *unrollers, void *priv); /** * @internal * @brief Create code generation guard * * @param[out] ctx Generator context * @param[in] genCallback Generator callback which is invoked if a function * matching the pattern is not found * @param[in] patSize Pattern size * * The guard does not allow generating several functions matching the same * pattern and, as a result, having the same name. * * @return a guard object on success; -ENOMEM if there is * not enough memory to allocate internal structures */ struct KgenGuard *createKgenGuard( struct KgenContext *ctx, int (*genCallback)(struct KgenContext *ctx, const void *pattern), size_t patSize); /** * @internal * @brief Reinitialize generator guard * * @param[out] guard An existing generation guard * @param[out] ctx Generator context * @param[in] genCallback Generator callback which is invoked if a function * matching the pattern is not found * @param[in] patSize Pattern size */ void reinitKgenGuard( struct KgenGuard *guard, struct KgenContext *ctx, int (*genCallback)(struct KgenContext *ctx, const void *pattern), size_t patSize); /** * @internal * @brief Find an already generated function or generate it * * @param[out] guard An existing generation guard * @param[in] pattern Pattern the function being looked for should match * @param[out] name Buffer to store the name of the function * @param[in] nameLen Name buffer length * * At first it tries to find an already generated function matching the passed * pattern. If the guard doesn't find the function, it invokes the generator * callback * * NOTE: names of generated functions should not exceed the 'FUNC_NAME_MAXLEN' * constant. * * @return 0 on success, otherwise returns a negated error code:\n * -ENOMEM: not enough memory to allocate internal structures\n * -EOVERFLOW: source buffer overflowing */ int findGenerateFunction( struct KgenGuard *guard, const void *pattern, char *name, size_t nameLen); /** * @internal * @brief Destroy code generation guard * * @param[out] guard A guard instance to be destroyed */ void destroyKgenGuard(struct KgenGuard *guard); /*@}*/ /** * @internal * @defgroup KGEN_AUX_FUNCS Auxiliary functions * @ingroup KGEN_INFRA */ /*@{*/ void kstrcpy(Kstring *kstr, const char *str); void ksprintf(Kstring *kstr, const char *fmt,...); void kstrcatf(Kstring *kstr, const char *fmt,...); // unified pointer type name const char *uptrTypeName(UptrType type); /** * @internal * @brief get a BLAS data type dependent function prefix * * @param[in] type Data type * * A literal returned by the function is assumed to be used as the prefix * of some generated function to put the accent on the BLAS data type it * operates with.
* * @return 0 if an unknown type is passed */ char dtypeToPrefix(DataType type); /** * @internal * @brief convert a BLAS data type to the respective built-in OpenCL type * * @param[in] dtype Data type * * @return NULL if an unknown type is passed */ const char *dtypeBuiltinType(DataType dtype); /** * internal * @brief Return unified pointer field corresponding to the data type * * @param[in] dtype Data type * * @Returns NULL if an unknown type is passed */ const char *dtypeUPtrField(DataType dtype); /** * @internal * @brief Return "one" value string depending on the data type * * @param[in] dtype Data type * * @return NULL if an unknown type is passed */ const char *strOne(DataType dtype); /** * @internal * @brief Get vector type name * * @param[in] dtype Data type * @param[in] vecLen Vector length for the type. Must be set to 1 if * the type is scalar. * @param[out] typeName Location to store pointer to a constant string * with the type name * @param[out] typePtrName Location to store unified pointer field * corresponding to the vector consisting of elements * of \b dtype \b type */ void getVectorTypeName( DataType dtype, unsigned int vecLen, const char **typeName, const char **typePtrName); /*@}*/ #endif /* KERNGEN_H_ */ clblas-2.10/src/include/list.h000066400000000000000000000051201264277366700162540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Work with circular double linked lists */ #ifndef LIST_H_ #define LIST_H_ #include #if defined (_WIN64) typedef unsigned long long prt_size_t; #else typedef unsigned long prt_size_t; #endif #ifdef __cplusplus extern "C" { #endif #define offset_of(field, type) \ (prt_size_t)(&((type*)0)->field) #define container_of(node, field, type) \ (type*)((prt_size_t)(node) - offset_of(field, type)) typedef struct ListNode { struct ListNode *prev; struct ListNode *next; } ListNode; typedef ListNode ListHead; typedef void (*ListAction)(ListNode *node); typedef void (*ListPrivAction)(ListNode *node, void *priv); /* * Type of function comparing list node contents with a key. 
* On equality such a function must return 0 */ typedef int (*ListCmpFn)(const ListNode *node, const void *key); static __inline bool isListEmpty(ListHead *list) { return (list->next == list); } static __inline ListNode *listNodeFirst(const ListHead *head) { return head->next; } static __inline ListNode *listNodeLast(const ListHead *head) { return head->prev; } static __inline void listInitHead(ListHead *head) { head->prev = head; head->next = head; } void listAddToTail(ListHead *head, ListNode *node); void listAddToHead(ListHead *head, ListNode *node); void listDel(ListNode *node); ListNode *listDelFromTail(ListHead *head); void listDoForEach(ListHead *head, ListAction act); void listDoForEachSafe(ListHead *head, ListAction act); void listDoForEachPriv(const ListHead *head, ListPrivAction act, void *actPriv); void listDoForEachPrivSafe(const ListHead *head, ListPrivAction act, void *actPriv); ListNode *listNodeSearch(const ListHead *head, const void *key, ListCmpFn cmp); size_t listLength(const ListHead *head); #ifdef __cplusplus } #endif #endif /* LIST_H_ */ clblas-2.10/src/include/md5sum.h000066400000000000000000000026751264277366700165270ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * See md5.c for more information. */ #ifdef HAVE_OPENSSL #include #elif !defined(_MD5_H) #define _MD5_H /* Any 32-bit or wider unsigned integer data type will do */ typedef unsigned int MD5_u32plus; typedef struct { MD5_u32plus lo, hi; MD5_u32plus a, b, c, d; unsigned char buffer[64]; MD5_u32plus block[16]; } MD5_CTX; extern void MD5_Init(MD5_CTX *ctx); extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size); extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); char * md5sum (const void * data, unsigned long size); #endif clblas-2.10/src/include/mempat.h000066400000000000000000000044561264277366700165770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
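 *
 * Editor's note: an illustrative, hedged sketch of the circular list API
 * declared in list.h above (not part of the original headers). The 'Entry'
 * type, its 'link' field and the comparison callback are assumptions made
 * up for the example; the statements are assumed to live inside some caller
 * function:
 *
 *     typedef struct Entry { int value; ListNode link; } Entry;
 *
 *     static int cmpEntry(const ListNode *node, const void *key)
 *     {
 *         const Entry *e = container_of(node, link, Entry);
 *         return !(e->value == *(const int *)key);  // 0 means "equal"
 *     }
 *
 *     ListHead head;
 *     Entry e = { 42 };
 *     int key = 42;
 *
 *     listInitHead(&head);
 *     listAddToTail(&head, &e.link);
 *     ListNode *found = listNodeSearch(&head, &key, cmpEntry);
 *     Entry *entry = (found != NULL) ? container_of(found, link, Entry) : NULL;
 *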
* ************************************************************************/ /* * Memory usage pattern related definitions */ #ifndef MEMPAT_H_ #define MEMPAT_H_ #include enum { MAX_MEMORY_PATTERNS = 16 }; /** * @internal * @brief Memory level identifiers * * @ingroup SOLVERIF */ typedef enum CLMemLevel { CLMEM_LEVEL_LDS = 0x01, /**< Local data storage */ CLMEM_LEVEL_L1 = 0x02, /**< L1 cache */ CLMEM_LEVEL_L2 = 0x04 /**< L2 cache */ } CLMemLevel; /** * @internal * @brief Memory type identifiers * * @ingroup SOLVERIF */ typedef enum CLMemType { CLMEM_GLOBAL_MEMORY, CLMEM_LOCAL_MEMORY, CLMEM_IMAGE, // FIXME: it's for backward compatibility, remove after blkmul deprecation CLMEM_BUFFER = CLMEM_LOCAL_MEMORY } CLMemType; // memory levels set typedef unsigned int meml_set_t; /* * FIXME: deprecate cuLevel and thLevel */ /** * @internal * @brief Solver memory pattern description structure * * The structure decribes memory using features and used * by frontend at choosing of solving strategy and decomposition * block sizes * * @ingroup SOLVERIF */ typedef struct MemoryPattern { const char *name; /**< Pattern's name */ unsigned int nrLevels; /**< Decomposition levels number */ /** Level a problem is decomposed among compute units at */ int cuLevel; /** Level a problem is decomposed among threads within single compute unit */ int thLevel; SolverOps *sops; /**< Solver operations */ /** extra information specific for the application field */ void *extra; } MemoryPattern; #endif /* MEMPAT_H_ */ clblas-2.10/src/include/msvc.h000066400000000000000000000020031264277366700162460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Declarations not supported in visual studio * by default */ #ifndef MSVC_H_ #define MSVC_H_ #ifndef EOVERFLOW #define EOVERFLOW 1000 #endif /* EOVERFLOW */ #if ( _MSC_VER < 1900 ) #define snprintf _snprintf #endif typedef long ssize_t; #endif /* MSVS_H_ */ clblas-2.10/src/include/mutex.h000066400000000000000000000020101264277366700164360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef MUTEX_H_ #define MUTEX_H_ #ifdef __cplusplus extern "C" { #endif typedef void* mutex_t; mutex_t* mutexInit(void); int mutexDestroy(mutex_t *mutex); int mutexLock(mutex_t *mutex); int mutexUnlock(mutex_t *mutex); #ifdef __cplusplus } #endif #endif /* MUTEX_H_ */ clblas-2.10/src/include/rwlock.h000066400000000000000000000050321264277366700166040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef RWLOCK_H_ #define RWLOCK_H_ #ifdef __cplusplus extern "C" { #endif /* * The rwlock functions provide an abstract implementation of a * readers-writer lock (also called a shared/exclusive lock). * * Simply speaking, that kind of locks allows either * - multiple concurently read access to a data structure. * - a single write access that excludes other read or write accesses. * * A read region shall start by a call to rwlockReadLock() and stop with * rwlockReadUnlock() * * A write region shall start by a call to rwlockWriteLock() and stop with * rwlockWriteUnlock() * * Of course two consecutive calls of either rwlockReadLock() and * rwlockWriteLock() in a single thread will cause a deadlock. * * * Example: Access to a protected counter * * * int counter = 0 ; * rwlock_t *lock = rwlockInit() ; * * int getCounter() * { * int v ; * rwlockReadLock(lock) ; * v = counter ; * rwlockReadUnlock(lock) ; * return v ; * } * * int preIncrementCounter() * { * int v ; * rwlockWriteLock(lock) ; * v = ++counter ; * rwlockWriteUnlock(lock) ; * return v; * } * * * */ typedef void* rwlock_t; /* * Create and initialize a new readers-writer lock * */ rwlock_t* rwlockInit(void); /* * Destroy a readers-writer lock previously created by rwlockInit() * */ int rwlockDestroy(rwlock_t *rwlock); /* * Enter a (shared) read region * */ int rwlockReadLock(rwlock_t *rwlock ); /* * Enter a (exclusive) write region * */ int rwlockWriteLock(rwlock_t *rwlock ); /* * Leave a read region * */ int rwlockReadUnlock(rwlock_t *rwlock ); /* * Leave a write region * */ int rwlockWriteUnlock(rwlock_t *rwlock ); #ifdef __cplusplus } #endif #endif /* RWLOCK_H_ */ clblas-2.10/src/include/solver.h000066400000000000000000000135601264277366700166220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SOLVER_H_ #define SOLVER_H_ #include #include #include #include #include #include struct Kernel; // OpenCL solver ID typedef int solver_id_t; /** * @internal * @defgroup SOLVERIF Solver interface * * This interface binds the library frontend to the library backend */ /*@{*/ /** * @internal * @brief Solver flags */ typedef enum SolverFlags { /** supports 1D work space */ SF_WSPACE_1D = 0x01, /** supports 2D work space */ SF_WSPACE_2D = 0x02, /** input data blocks at the top level must be square */ SF_TOP_INPUT_SQUARE_BLOCKS = 0x04 } SolverFlags; typedef enum PatternPerformance{ PPERF_NOT_SUPPORTED = -1, PPERF_POOR = 0, PPERF_AVERAGE, PPERF_GOOD, PPERF_BEST } PatternPerformance; typedef enum CheckCalcPGran{ PGRAN_CHECK = 0, PGRAN_CALC } CheckCalcPGran; /** * @internal * @brief type of function generating kernel source for an * OpenCL based solver * * @param[out] buf Pointer to a buffer to store a generated kenrel to * @param[in] buflen Length of the buffer * @param[in] subdims Subproblem dimensions to generate an optimal kernel * @param[in] pgran Data parallelism granularity * @param[in] extra Generator extra information depending on the * application fields * * If the pointer to the buffer is NULL, the function should just calculate * needed size of the buffer to fit the code in. * * @return size of the generated kernel source on success; negated error code * otherwise * - -ENOMEM: enough of memory to allocated internal structures * - -EOVERFLOW: generated source exceeds the buffer size * - -EINVAL: invalid argument is passed */ typedef ssize_t (*SolverKgen)( char *buf, size_t buflen, const SubproblemDim *subdims, const PGranularity *pgran, void *extra); /** * @internal * @brief Solver operations * * The 'args' parameter for 'calcPrepWorkGroups', * and the second parameter for the 'assignKargs' methods plays the role of pointer * to a kernel arguments structure depending on the application field. */ typedef struct SolverOps { /** Kernel generator */ SolverKgen genKernel; /** Assign kernel arguments; the first argument is kernel argument batch * passed immediately to a kernel */ void (*assignKargs)(KernelArg*, const void* args, const void *extra); /** Check if available LDS size is enough to fit all needed data at such * granulation; 'kernelArgs' - kernel arguments depending on the * application fields */ bool (*isFitToLDS)( SubproblemDim *dims, DataType, cl_ulong ldsSize, const void *args); /** Get the pattern`s performance estimation for specified flags, * arguments and granulation. * Is used for selecting most suitable pattern current problem */ int (*getPatternPerf)( unsigned int kflags, const void *args); /** * Inner decomposition axis matching to the fastest moving OpenCL * work dimension. Used only for those patterns which use 2 dimensional * decomposition */ DecompositionAxis (*innerDecompositionAxis)(const void *args); /** Calculate number of needed global threads to execute a kernel */ void (*calcThreads)( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); /** Set number of lines of the same top level block stored into the image * together and the direction of blocks storing. 
A solver that uses images * and stores data to images by blocks must provide the method */ void (*imgPackMode)( const void *extra, const SubproblemDim *subdims, int dataID, unsigned int *rate, clblasOrder *order); /** Get solver flags */ SolverFlags (*getFlags)(void); /** Correct problem arguments anr extra kernel parameters * depending on solver specifics. Basically, a solver should not * change any arguments that come from the API level to avoid any * confusing points */ void (*fixupArgs)(void *args, SubproblemDim* pSubDims, void *extra); /** Function, returning default decomposition for the pattern */ int ( *getDefaultDecomp)( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs); /** Perform validation of decomposition. * If "check" flag set to true: validate specified decomposition and * check, if specified granulation is valid for it. * If "check" flag set to false: calculate granulation, * fitting the specified decomposition, if possible */ bool (*checkCalcDecomp)( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); /* SetBuildOptions */ void (*setBuildOptions)( char *buildOptsStr, const void *args); /* * selectVectorization */ KernelExtraFlags (*selectVectorization)( void *kargs, unsigned int vlen); } SolverOps; /*@}*/ #endif /* SOLVER_H_ */ clblas-2.10/src/include/trace_malloc.h000066400000000000000000000040661264277366700177360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
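 *
 * Editor's illustrative sketch (not part of the original headers): a hedged
 * example of how a backend might wire a generator into the SolverOps table
 * declared in solver.h above. The names 'myGemmKgen', 'myGetFlags' and
 * 'myOps' are assumptions made up for the example; real solvers fill in
 * many more callbacks:
 *
 *     static SolverFlags myGetFlags(void)
 *     {
 *         return SF_WSPACE_2D;   // this solver decomposes over a 2D work space
 *     }
 *
 *     static ssize_t myGemmKgen(char *buf, size_t buflen,
 *                               const SubproblemDim *subdims,
 *                               const PGranularity *pgran, void *extra)
 *     {
 *         // A real generator emits OpenCL source into 'buf' and returns its
 *         // size; when 'buf' is NULL it only reports the needed size
 *         // (see the SolverKgen typedef above).
 *         (void)buf; (void)buflen; (void)subdims; (void)pgran; (void)extra;
 *         return 0;   // placeholder
 *     }
 *
 *     static const SolverOps myOps = {
 *         .genKernel = myGemmKgen,   // kernel source generator
 *         .getFlags  = myGetFlags,   // declared solver capabilities
 *     };
 *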
* ************************************************************************/ /* * Define simple functionality to track memory leaks in order to separate * library leaks from leaks in the other components and to take info in * a human friendly format */ #ifndef TRACE_MALLOC_H_ #define TRACE_MALLOC_H_ #ifdef __cplusplus extern "C" { #endif #if defined(TRACE_MALLOC) #define malloc(size) debugMalloc(size, __FILE__, __LINE__) #define calloc(nmemb, size) debugCalloc(size * nmemb, __FILE__, __LINE__) #define realloc(ptr, size) debugRealloc(ptr, size, __FILE__, __LINE__) #define free(ptr) debugFree(ptr) void initMallocTrace(void); void *debugMalloc(size_t size, const char *file, int line); void *debugCalloc(size_t size, const char *file, int line); void *debugRealloc(void *ptr, size_t size, const char *file, int line); void debugFree(void *ptr); void printMallocStatistics(void); void printMemLeaksInfo(void); void releaseMallocTrace(void); #else /* TRACE_MALLOC */ static __inline void initMallocTrace(void) { /* do nothing */ } static __inline void printMallocStatistics(void) { /* do nothing */ } static __inline void printMemLeaksInfo(void) { /* do nothing */ } static __inline void releaseMallocTrace(void) { /* do nothing */ } #endif /* !TRACE_MALLOC */ #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* TRACE_MALLOC_H_ */ clblas-2.10/src/library/000077500000000000000000000000001264277366700151535ustar00rootroot00000000000000clblas-2.10/src/library/CMakeLists.txt000066400000000000000000000773301264277366700177250ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## find_package(PythonInterp REQUIRED) ################################################################################ # AutoGemm Begin ################################################################################ # AutoGemm scripts and out files set(AUTOGEMM_SCRIPTS ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemm.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemmParameters.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/Common.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/Includes.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelOpenCL.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelParameters.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelSelection.py ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelsToPreCompile.py ) set(AUTOGEMM_HEADERS ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmClKernels.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBuildOptionsBinary.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBinaries.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSelection.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSelectionSpecific.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBuildOptionsSource.h ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSources.h ) set(AUTOGEMM_SRC ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.cc ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmClKernels.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBuildOptionsBinary.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBinaries.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSelection.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSelectionSpecific.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelBuildOptionsSource.cpp ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelSources.cpp ) #set(USERGEMM_SRC # ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.cpp #) set(USERGEMM_HEADERS ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.h ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.h ) set(AUTOGEMM_TEST_SRC ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemmTools/TestAutoGemm.cpp ) set(AUTOGEMM_PROFILER_SRC ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemmTools/ProfileAutoGemm.cpp ) set(AUTOGEMM_PRECOMPILE_SRC ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp ) set( AUTOGEMM_PRECOMPILED_KERNELS ${CMAKE_BINARY_DIR}/include/AutoGemmKernelBinaries/AutoGemmKernelBinariesPreCompiled.h ) # AutoGemm options for pre-compiling kernels option( PRECOMPILE_GEMM_PRECISION_SGEMM "AutoGemm: pre-compile sgemm kernels" OFF) option( PRECOMPILE_GEMM_PRECISION_DGEMM "AutoGemm: pre-compile dgemm kernels" OFF) option( PRECOMPILE_GEMM_PRECISION_CGEMM "AutoGemm: pre-compile cgemm kernels" OFF) option( PRECOMPILE_GEMM_PRECISION_ZGEMM "AutoGemm: pre-compile zgemm kernels" OFF) option( PRECOMPILE_GEMM_TRANS_NN "AutoGemm: pre-compile NN transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_NT "AutoGemm: pre-compile NT transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_NC "AutoGemm: pre-compile NC transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_TN "AutoGemm: pre-compile TN transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_TT "AutoGemm: pre-compile TT transpose cases" OFF) option( 
PRECOMPILE_GEMM_TRANS_TC "AutoGemm: pre-compile TC transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_CN "AutoGemm: pre-compile CN transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_CT "AutoGemm: pre-compile CT transpose cases" OFF) option( PRECOMPILE_GEMM_TRANS_CC "AutoGemm: pre-compile CC transpose cases" OFF) set( AUTOGEMM_ARCHITECTURE "Hawaii" CACHE STRING "AutoGemm: device for kernel selection logic" ) set_property( CACHE AUTOGEMM_ARCHITECTURE PROPERTY STRINGS "Hawaii" "Fiji" ) # opencl compiler version #set( PRECOMPILE_GEMM_OPENCL_VERSION "2.0" CACHE STRING "OpenCL compiler version supported by device driver." ) #set_property( CACHE PRECOMPILE_GEMM_OPENCL_VERSION PROPERTY STRINGS 2.0 1.2 1.1 ) #message( STATUS "AutoGemm PreCompiler will use OpenCL ${PRECOMPILE_GEMM_OPENCL_VERSION} compiler." ) # PreCompile precision selected? set( PRECOMPILE_GEMM_PRECISION_SELECTED OFF) if ( PRECOMPILE_GEMM_PRECISION_SGEMM OR PRECOMPILE_GEMM_PRECISION_DGEMM OR PRECOMPILE_GEMM_PRECISION_CGEMM OR PRECOMPILE_GEMM_PRECISION_ZGEMM ) set( PRECOMPILE_GEMM_PRECISION_SELECTED ON) endif() # PreCompile transpose selected? set( PRECOMPILE_GEMM_TRANS_SELECTED OFF) if ( PRECOMPILE_GEMM_TRANS_NN OR PRECOMPILE_GEMM_TRANS_NT OR PRECOMPILE_GEMM_TRANS_NC OR PRECOMPILE_GEMM_TRANS_TN OR PRECOMPILE_GEMM_TRANS_TT OR PRECOMPILE_GEMM_TRANS_TC OR PRECOMPILE_GEMM_TRANS_CN OR PRECOMPILE_GEMM_TRANS_CT OR PRECOMPILE_GEMM_TRANS_CC ) set( PRECOMPILE_GEMM_TRANS_SELECTED ON) endif() # PreCompile is valid and active? set( PRECOMPILE_GEMM_ACTIVE OFF) if ( PRECOMPILE_GEMM_PRECISION_SELECTED AND PRECOMPILE_GEMM_TRANS_SELECTED) # valid selection set( PRECOMPILE_GEMM_ACTIVE ON) MESSAGE( STATUS "AutoGemm-PreCompile: selected kernels will be pre-compiled." ) elseif(NOT PRECOMPILE_GEMM_PRECISION_SELECTED AND NOT PRECOMPILE_GEMM_TRANS_SELECTED) MESSAGE( STATUS "AutoGemm-PreCompile: no kernels to be pre-compiled." ) else() MESSAGE( SEND_ERROR "AutoGemm-PreCompile: To pre-compile gemm kernels, select at least one option from each of PRECOMPILE_GEMM_PRECISION_* and PRECOMPILE_GEMM_TRANS_*; otherwise, unselect all PRECOMPILE_GEMM_* options to not pre-compile any gemm kernels."
) endif() # build commandline argument for AutoGemm set( AGPC_ARGS --output ${CMAKE_BINARY_DIR}/include ) if ( PRECOMPILE_GEMM_ACTIVE ) # precisions set(AGPC_ARGS ${AGPC_ARGS} --precisions ) if (PRECOMPILE_GEMM_PRECISION_SGEMM) set(AGPC_ARGS ${AGPC_ARGS} s ) endif() if (PRECOMPILE_GEMM_PRECISION_DGEMM) set(AGPC_ARGS ${AGPC_ARGS} d ) endif() if (PRECOMPILE_GEMM_PRECISION_CGEMM) set(AGPC_ARGS ${AGPC_ARGS} c ) endif() if (PRECOMPILE_GEMM_PRECISION_ZGEMM) set(AGPC_ARGS ${AGPC_ARGS} z ) endif() # orders set(AGPC_ARGS ${AGPC_ARGS} --orders clblasColumnMajor ) # transposes set(AGPC_ARGS ${AGPC_ARGS} --transposes ) if (PRECOMPILE_GEMM_TRANS_NN) set(AGPC_ARGS ${AGPC_ARGS} NN ) endif() if (PRECOMPILE_GEMM_TRANS_NT) set(AGPC_ARGS ${AGPC_ARGS} NT ) endif() if (PRECOMPILE_GEMM_TRANS_NC) set(AGPC_ARGS ${AGPC_ARGS} NC ) endif() if (PRECOMPILE_GEMM_TRANS_TN) set(AGPC_ARGS ${AGPC_ARGS} TN ) endif() if (PRECOMPILE_GEMM_TRANS_TT) set(AGPC_ARGS ${AGPC_ARGS} TT ) endif() if (PRECOMPILE_GEMM_TRANS_TC) set(AGPC_ARGS ${AGPC_ARGS} TC ) endif() if (PRECOMPILE_GEMM_TRANS_CN) set(AGPC_ARGS ${AGPC_ARGS} CN ) endif() if (PRECOMPILE_GEMM_TRANS_CT) set(AGPC_ARGS ${AGPC_ARGS} CT ) endif() if (PRECOMPILE_GEMM_TRANS_CC) set(AGPC_ARGS ${AGPC_ARGS} CC ) endif() # betas set(AGPC_ARGS ${AGPC_ARGS} --betas 0 1 ) ################################################################################ # add target for generating pre-compile WhichKernels header ################################################################################ set( AUTOGEMM_PRECOMPILE_HEADER_SRC ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelsToPreCompile.py ) set( AUTOGEMM_PRECOMPILE_HEADER_OUT ${CMAKE_BINARY_DIR}/include/AutoGemmIncludes/AutoGemmKernelsToPreCompile.h ) add_custom_command( OUTPUT ${AUTOGEMM_PRECOMPILE_HEADER_OUT} COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/KernelsToPreCompile.py ${AGPC_ARGS} DEPENDS ${AUTOGEMM_PRECOMPILE_HEADER_SRC} ) ################################################################################ # add target for compiling pre-compile executable ################################################################################ add_executable(AutoGemm_PreCompile_Bin ${AUTOGEMM_PRECOMPILE_SRC} ${AUTOGEMM_PRECOMPILE_HEADER_OUT} ${AUTOGEMM_HEADERS} ${AUTOGEMM_SRC} ${AUTOGEMM_SCRIPTS} ) target_link_libraries(AutoGemm_PreCompile_Bin ${OPENCL_LIBRARIES}) set_property( TARGET AutoGemm_PreCompile_Bin PROPERTY FOLDER "AutoGemm") #set_target_properties( #AutoGemm_PreCompile_Bin #PROPERTIES #EXCLUDE_FROM_ALL TRUE #EXCLUDE_FROM_DEFAULT_BUILD TRUE #) ################################################################################ # add target for running pre-compile executable ################################################################################ add_custom_command( OUTPUT ${AUTOGEMM_PRECOMPILED_KERNELS} COMMAND AutoGemm_PreCompile_Bin ${CMAKE_BINARY_DIR} DEPENDS AutoGemm_PreCompile_Bin ) endif()#endif precompile active ################################################################################ # add target for main AutoGemm headers / source ################################################################################ add_custom_command( OUTPUT ${AUTOGEMM_HEADERS} ${AUTOGEMM_SRC} COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/library/blas/AutoGemm/AutoGemm.py --output-path ${CMAKE_BINARY_DIR}/include --opencl-compiler-version ${OPENCL_VERSION} --architecture ${AUTOGEMM_ARCHITECTURE} DEPENDS ${AUTOGEMM_SCRIPTS} ) include_directories( ${OPENCL_INCLUDE_DIRS} ${CMAKE_SOURCE_DIR} 
${CMAKE_BINARY_DIR}/include . ) ################################################################################ # AutoGemm Tools ################################################################################ add_executable(AutoGemm_Tools_Test ${AUTOGEMM_TEST_SRC} ${AUTOGEMM_SRC} ${AUTOGEMM_HEADERS} ${AUTOGEMM_SCRIPTS} ) target_link_libraries(AutoGemm_Tools_Test ${OPENCL_LIBRARIES}) set_property( TARGET AutoGemm_Tools_Test PROPERTY FOLDER "AutoGemm") set_target_properties( AutoGemm_Tools_Test PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE ) add_executable(AutoGemm_Tools_Profile ${AUTOGEMM_PROFILER_SRC} ${AUTOGEMM_SRC} ${AUTOGEMM_HEADERS} ${AUTOGEMM_SCRIPTS} ) target_link_libraries(AutoGemm_Tools_Profile ${OPENCL_LIBRARIES}) set_property( TARGET AutoGemm_Tools_Profile PROPERTY FOLDER "AutoGemm") set_target_properties( AutoGemm_Tools_Profile PROPERTIES EXCLUDE_FROM_ALL TRUE EXCLUDE_FROM_DEFAULT_BUILD TRUE ) source_group(AutoGemm\\scripts FILES ${AUTOGEMM_SCRIPTS} ) source_group(AutoGemm\\include FILES ${AUTOGEMM_HEADERS} ) source_group(AutoGemm\\src FILES ${AUTOGEMM_SRC} ${AUTOGEMM_PRECOMPILED_KERNELS} ) ################################################################################ # AutoGemm End ################################################################################ ################################################################################ # BEGIN Pre Compile General (static) Kernels ################################################################################ # options for pre-compiling trsm kernels option( PRECOMPILE_TRSM_STRSM "pre-compile available strsm kernels" OFF ) option( PRECOMPILE_TRSM_DTRSM "pre-compile available dtrsm kernels" OFF ) if(PRECOMPILE_TRSM_DTRSM) add_definitions(-DCLBLAS_OFFLINE_COMPILE_DTRSM) message(STATUS "precompile DTRSM kernels.") endif() if(PRECOMPILE_TRSM_STRSM) add_definitions(-DCLBLAS_OFFLINE_COMPILE_STRSM) message(STATUS "precompile STRSM kernels.
(not yet implemented)") endif() ################################################################################ # END Pre Compile General (static) Kernels ################################################################################ set(SRC_BLAS blas/init.c blas/impl.c blas/scimage.c blas/xgemv.c blas/xsymv.c blas/xgemm.cc blas/xtrmm.c blas/xtrsm.cc blas/xsyrk.c blas/xsyr2k.c blas/xtrmv.c blas/xtrsv.c blas/xsymm.c blas/xgemm2.c blas/xger.c blas/xsyr.c blas/xsyr2.c blas/xher.c blas/xher2.c blas/xhemv.c blas/xhemm.c blas/xherk.c blas/xhpmv.c blas/xspmv.c blas/xgbmv.c blas/xtbmv.c blas/xshbmv.c blas/xtbsv.c blas/xher2k.c blas/xswap.c blas/xscal.cc blas/xcopy.c blas/xaxpy.c blas/xdot.c blas/xrotg.c blas/xrotmg.c blas/xrot.c blas/xrotm.c blas/ixamax.c blas/xnrm2.c blas/xasum.c blas/matrix.c blas/fill.cc blas/functor/functor.cc blas/functor/functor_selector.cc blas/functor/functor_xgemm.cc blas/functor/functor_xscal.cc blas/functor/functor_xtrsm.cc blas/functor/functor_xscal_generic.cc blas/functor/tahiti.cc blas/functor/hawaii.cc blas/functor/bonaire.cc blas/functor/gcn_dgemm.cc blas/functor/gpu_dtrsm.cc blas/functor/gpu_dtrsm192.cc blas/functor/functor_fill.cc blas/functor/hawaii_dgemmChannelConflict.cc blas/functor/hawaii_dgemmSplitKernel.cc blas/functor/hawaii_sgemmSplitKernel.cc blas/functor/hawaii_sgemmSplit64_32.cc blas/functor/gcn_dgemmCommon.cc blas/functor/gcn_sgemm.cc blas/functor/gcn_zgemm.cc blas/functor/gcn_dgemmSmallMatrices.cc blas/functor/gcn_sgemmSmallMatrices.cc blas/functor/hawaii_sgemmBranchKernel.cc blas/functor/hawaii_sgemmBig1024Kernel.cc blas/specialCases/GemmSpecialCases.cpp ) set(SRC_BLAS_HEADERS blas/include/blas_funcs.h blas/include/matrix_dims.h blas/include/matrix_props.h blas/include/blas_mempat.h blas/include/clblas-internal.h blas/include/solution_seq.h blas/include/events.h blas/include/xgemm.h blas/functor/include/functor.h blas/functor/include/functor_xgemm.h blas/functor/include/functor_xscal.h blas/functor/include/functor_xtrsm.h blas/functor/include/functor_xscal_generic.h blas/functor/include/functor_selector.h blas/functor/include/tahiti.h blas/functor/include/hawaii.h blas/functor/include/bonaire.h blas/functor/include/gcn_dgemm.h blas/functor/include/gpu_dtrsm.h blas/functor/include/gpu_dtrsm192.h blas/functor/include/BinaryBuild.h blas/functor/include/hawaii_dgemmChannelConflict.h blas/functor/include/hawaii_dgemmSplitKernel.h blas/functor/include/hawaii_sgemmSplitKernel.h blas/functor/include/hawaii_sgemmSplit64_32.h blas/functor/include/gcn_dgemmCommon.h blas/functor/include/gcn_sgemm.h blas/functor/include/gcn_zgemm.h blas/functor/include/gcn_dgemmSmallMatrices.h blas/functor/include/gcn_sgemmSmallMatrices.h blas/functor/include/hawaii_sgemmBranchKernel.h blas/functor/include/hawaii_sgemmBig1024Kernel.h blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.h blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.h ) set(SRC_BLAS_GENERIC blas/generic/common.c blas/generic/common2.cc blas/generic/blas_funcs.c blas/generic/events.c blas/generic/matrix_props.c blas/generic/matrix_dims.c blas/generic/kdump.c blas/generic/solution_assert.c blas/generic/solution_seq.c blas/generic/solution_seq_make.c blas/generic/problem_iter.c blas/generic/kernel_extra.c blas/generic/binary_lookup.cc blas/generic/functor_cache.cc ) set(SRC_BLAS_GENS blas/gens/gen_init.c blas/gens/blas_kgen.c blas/gens/blas_subgroup.c blas/gens/gen_helper.c blas/gens/tilemul.c blas/gens/fetch.c blas/gens/tile.c blas/gens/tile_iter.c blas/gens/decomposition.c blas/gens/gemv.c 
blas/gens/symv.c blas/gens/gemm.c blas/gens/trmm.c blas/gens/trsm.c blas/gens/syrxk.c blas/gens/trxm_common.c blas/gens/trsm_kgen.c blas/gens/xxmv_common.c blas/gens/legacy/blas_kgen_legacy.c blas/gens/legacy/gen_helper_legacy.c blas/gens/legacy/trxm_common_legacy.c blas/gens/legacy/trsm_kgen_legacy.c blas/gens/legacy/blkmul.c blas/gens/legacy/gemm_lds.c blas/gens/legacy/gemm_img.c blas/gens/legacy/trmm_lds.c blas/gens/legacy/trmm_img.c blas/gens/legacy/trsm_lds.c blas/gens/legacy/trsm_img.c blas/gens/legacy/trsm_cached_lds.c blas/gens/trmv_reg.cpp blas/gens/ger_lds.cpp blas/gens/trsv_trtri.cpp blas/gens/trsv_gemv.cpp blas/gens/kprintf.cpp blas/gens/syr_lds.cpp blas/gens/her_lds.cpp blas/gens/syr2_lds.cpp blas/gens/her2_lds.cpp blas/gens/symm_cached.cpp blas/gens/gemm_cached.cpp blas/gens/gemm_tail_cached.cpp blas/gens/gbmv.cpp blas/gens/tuned_numbers.c blas/gens/swap_reg.cpp blas/gens/scal_reg.cpp blas/gens/copy_reg.cpp blas/gens/axpy_reg.cpp blas/gens/dot.cpp blas/gens/reduction.cpp blas/gens/rotg_reg.cpp blas/gens/rotmg_reg.cpp blas/gens/rotm_reg.cpp blas/gens/iamax.cpp blas/gens/nrm2.cpp blas/gens/asum.cpp ) #set (BIN_CL_TEMPLATES #dgemm_hawai.cl #dtrsm_gpu.cl #) set (SRC_CL_TEMPLATES gemm.cl gemm_helper.cl gbmv.cl ger.cl her.cl symm_helper.cl syr2_her2.cl syr_her.cl trsv.cl her2.cl symm.cl syr2.cl syr.cl trmv.cl trsv_gemv.cl swap.cl scal.cl copy.cl axpy.cl dot.cl reduction.cl rotg.cl rotmg.cl rotm.cl iamax.cl nrm2.cl asum.cl custom_gemm.cl dgemm_hawai.cl dgemm_hawaiiChannelConfilct.cl dgemm_hawaiiSplitKernel.cl sgemm_hawaiiSplitKernel.cl dtrsm_gpu.cl dtrsm_gpu192.cl dgemm_gcn_SmallMatrices.cl sgemm_gcn_SmallMatrices.cl sgemm_gcn_bigMatrices.cl sgemm_gcn.cl zgemm_gcn.cl ) set(SRC_CL_TEMPLATES_GEN dgemm_hawai.clHawaii_64.bin.cl dtrsm_gpu.clHawaii_64.bin.cl dtrsm_gpu192.clHawaii_64.bin.cl dgemm_hawaiiChannelConfilct.clHawaii_64.bin.cl dgemm_hawaiiSplitKernel.clHawaii_64.bin.cl sgemm_hawaiiSplitKernel.clHawaii_64.bin.cl sgemm_hawaiiSplitKernel.clBonaire_64.bin.cl dgemm_hawai.clTahiti_64.bin.cl dtrsm_gpu.clTahiti_64.bin.cl dgemm_gcn_SmallMatrices.clHawaii_64.bin.cl dgemm_gcn_SmallMatrices.clTahiti_64.bin.cl sgemm_gcn_SmallMatrices.clHawaii_64.bin.cl sgemm_gcn_SmallMatrices.clTahiti_64.bin.cl sgemm_gcn_SmallMatrices.clBonaire_64.bin.cl sgemm_gcn_bigMatrices.clHawaii_64.bin.cl sgemm_gcn_bigMatrices.clTahiti_64.bin.cl sgemm_gcn_bigMatrices.clBonaire_64.bin.cl sgemm_gcn.clHawaii_64.bin.cl zgemm_gcn.clHawaii_64.bin.cl sgemm_gcn.clBonaire_64.bin.cl sgemm_gcn.clTahiti_64.bin.cl sgemm_hawaiiSplit64_32.clHawaii_64.bin.cl ) set(SRC_BLAS_GENERIC_HEADERS blas/generic/solution_assert.h blas/generic/problem_iter.h ) set(SRC_BLAS_GENS_HEADERS blas/gens/fetch.h blas/gens/blas_kgen.h blas/gens/blas_subgroup.h blas/gens/gen_helper.h blas/gens/init.h blas/gens/trxm_common.h blas/gens/trsm_kgen.h blas/gens/xxmv_common.h blas/gens/tile.h blas/gens/tile_iter.h blas/gens/tuned_numbers.h ) set(SRC_COMMON common/list.c common/clkern.c common/kern_cache.c common/kerngen_core.c common/kgen_basic.c common/kgen_loop_helper.c common/kgen_guard.c common/misc.c common/devinfo.c common/devinfo-cache.c common/mutex.c common/rwlock.c common/trace_malloc.c common/md5sum.c ) set(SRC_COMMON_GENS common/gens/dblock_kgen.c ) set(SRC_TOOLS tools/tune/toolslib.c tools/tune/fileio.c tools/tune/dimension.c tools/tune/storage_init.c tools/tune/storage_io.c tools/tune/storage_data.c ) set(CLBLAS_SOURCES ${SRC_COMMON} ${SRC_COMMON_GENS} ${SRC_BLAS} ${SRC_BLAS_GENERIC} ${SRC_BLAS_GENS} ${SRC_TOOLS} ../clBLAS.def ) set(GLOBAL_HEADERS 
${clBLAS_SOURCE_DIR}/clBLAS.h ${clBLAS_SOURCE_DIR}/clBLAS-complex.h ${clBLAS_SOURCE_DIR}/include/clkern.h ${clBLAS_SOURCE_DIR}/include/cltypes.h ${clBLAS_SOURCE_DIR}/include/dblock_kgen.h ${clBLAS_SOURCE_DIR}/include/defbool.h ${clBLAS_SOURCE_DIR}/include/devinfo.h ${clBLAS_SOURCE_DIR}/include/dis_warning.h ${clBLAS_SOURCE_DIR}/include/kern_cache.h ${clBLAS_SOURCE_DIR}/include/kernel_extra.h ${clBLAS_SOURCE_DIR}/include/kerngen.h ${clBLAS_SOURCE_DIR}/include/list.h ${clBLAS_SOURCE_DIR}/include/mempat.h ${clBLAS_SOURCE_DIR}/include/msvc.h ${clBLAS_SOURCE_DIR}/include/mutex.h ${clBLAS_SOURCE_DIR}/include/rwlock.h ${clBLAS_SOURCE_DIR}/include/solver.h ${clBLAS_SOURCE_DIR}/include/md5sum.h ${clBLAS_SOURCE_DIR}/include/binary_lookup.h ) source_group(common FILES ${SRC_COMMON}) source_group(common\\gens FILES ${SRC_COMMON_GENS}) source_group(blas FILES ${SRC_BLAS}) source_group(blas\\include FILES ${SRC_BLAS_HEADERS}) source_group(blas\\generic FILES ${SRC_BLAS_GENERIC}) source_group(blas\\gens FILES ${SRC_BLAS_GENS} ${SRC_BLAS_GENS_HEADERS}) include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/library/blas/include ${clBLAS_SOURCE_DIR}/library/blas/functor/include ${clBLAS_SOURCE_DIR}/library/tools/tune ${clBLAS_BINARY_DIR}/include ${clBLAS_SOURCE_DIR}/library/blas/AutoGemm ${clBLAS_SOURCE_DIR}/library/blas/AutoGemm/UserGemmKernelSources ${clBLAS_SOURCE_DIR}/library/blas/specialCases/include ${clBLAS_SOURCE_DIR}/library/blas/trtri ) option( BLAS_DUMP_CLBLAS_KERNELS "Force the library to dump OpenCL kernels to disk" OFF ) if( BLAS_DUMP_CLBLAS_KERNELS ) add_definitions( -DDUMP_CLBLAS_KERNELS ) endif() option( BLAS_KEEP_KERNEL_SOURCES "Prevent the library from stripping source from kernels" OFF ) if( BLAS_KEEP_KERNEL_SOURCES ) add_definitions( -DKEEP_CLBLAS_KERNEL_SOURCES ) endif() option( BLAS_TRACE_MALLOC "Simple functionality to track memory leaks" OFF ) if( BLAS_TRACE_MALLOC ) add_definitions( -DTRACE_MALLOC ) endif() option( BLAS_PRINT_BUILD_ERRORS "Enable printing of OpenCL compiler errors on stdout" ON ) if( BLAS_PRINT_BUILD_ERRORS ) add_definitions( -DPRINT_BUILD_ERRORS ) endif() include( ExternalProject ) ExternalProject_Add( tplgen URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen" INSTALL_COMMAND "" ) ################OCLBinaryGenerator if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM) ExternalProject_Add( OCLBinaryGenerator URL "${CMAKE_SOURCE_DIR}/library/tools/OCLBinaryGenerator" CMAKE_ARGS -DOPENCL_LIBRARIES=${OPENCL_LIBRARIES} -DOPENCL_INCLUDE_DIRS=${OPENCL_INCLUDE_DIRS} INSTALL_COMMAND "" ) ExternalProject_Get_Property( OCLBinaryGenerator binary_dir ) message(STATUS "OCLBinaryGenerator binary_dir =${binary_dir}") set( OCLBinaryGeneratorBinaryDir "${binary_dir}/staging" ) # OCLBinaryGenerator requires at least three inputs # 1, path to the kernel file # 2, file name # 3, output directory # 4, [optional] compiler flags # 5, [optional] trageted hardware. 
If this is not supplied OCLBinaryGenerator will generate binary for the first device on system set( OCL_COMPILER_FLAGS " ") if( OPENCL_VERSION STREQUAL "2.0") set( OCL_COMPILER_FLAGS "-cl-std=CL2.0") endif() add_custom_target( OCLBinaryGenerator_GEN ) add_custom_command(TARGET OCLBinaryGenerator_GEN PRE_BUILD COMMAND ${CMAKE_COMMAND} -DOCLBinaryGeneratorBinaryDir=${OCLBinaryGeneratorBinaryDir} -DSOURCE_DIR=${CMAKE_SOURCE_DIR} -DBINARY_DIR=${CMAKE_BINARY_DIR} -DOCL_COMPILER_FLAGS=${OCL_COMPILER_FLAGS} -P "${CMAKE_SOURCE_DIR}/library/OCLBinaryGenerator.cmake" ) add_dependencies( OCLBinaryGenerator_GEN OCLBinaryGenerator ) endif() # if offline compilation is not chosen, bingen should not be built if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL) ExternalProject_Add( bingen URL "${CMAKE_SOURCE_DIR}/library/tools/bingen" CMAKE_ARGS -DOPENCL_LIBRARIES=${OPENCL_LIBRARIES} -DOPENCL_INCLUDE_DIRS=${OPENCL_INCLUDE_DIRS} INSTALL_COMMAND "" ) endif() message(STATUS "OPENCL_VERSION = ${OPENCL_VERSION}") #if( OPENCL_VERSION STREQUAL "2.0") # if(EXISTS ${CMAKE_SOURCE_DIR}/flags.txt) # MESSAGE(STATUS "flags.txt found. will load AMD_OPENCL_BUILD_OPTIONS_APPEND from it.") # set (LOAD_CL_FLAGS TRUE) # file (STRINGS "${CMAKE_SOURCE_DIR}/flags.txt" OPENCL_FLAGS) # MESSAGE(STATUS "OCLFLAGS: ${OPENCL_FLAGS}") # string(REPLACE "OCL " "OCL;" OPENCL_FLAGS_REPLACED ${OPENCL_FLAGS}) # list(GET OPENCL_FLAGS_REPLACED 1 OPENCL_FLAGS_REPLACED_1)#flags for TAHITI # list(GET OPENCL_FLAGS_REPLACED 3 OPENCL_FLAGS_REPLACED_3)#flags for HAWAII 1 # list(GET OPENCL_FLAGS_REPLACED 5 OPENCL_FLAGS_REPLACED_5)#flags for HAWAII 2 # list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE # #MESSAGE("${OPENCL_FLAGS_REPLACED_7}") # elseif(EXISTS ${CMAKE_SOURCE_DIR}/flags_public.txt) # MESSAGE(STATUS "flags_public.txt found. will load AMD_OPENCL_BUILD_OPTIONS_APPEND from it.") # set (LOAD_CL_FLAGS TRUE) # file (STRINGS "${CMAKE_SOURCE_DIR}/flags_public.txt" OPENCL_FLAGS) # MESSAGE(STATUS "OCLFLAGS: ${OPENCL_FLAGS}") # string(REPLACE "OCL " "OCL;" OPENCL_FLAGS_REPLACED ${OPENCL_FLAGS}) # list(GET OPENCL_FLAGS_REPLACED 1 OPENCL_FLAGS_REPLACED_1)#flags for TAHITI # list(GET OPENCL_FLAGS_REPLACED 3 OPENCL_FLAGS_REPLACED_3)#flags for HAWAII 1 # list(GET OPENCL_FLAGS_REPLACED 5 OPENCL_FLAGS_REPLACED_5)#flags for HAWAII 2 # list(GET OPENCL_FLAGS_REPLACED 7 OPENCL_FLAGS_REPLACED_7)#flags for BONAIRE # else() # MESSAGE(STATUS "flags.txt not found. will use the default flags.") # set (LOAD_CL_FLAGS FALSE) # endif() #else() # MESSAGE(STATUS "loading of compiler flags requires OpenCL 2.0. 
will use default flags.") # set (LOAD_CL_FLAGS FALSE) #endif() set (LOAD_CL_FLAGS FALSE) #set( bingenBinaryDir "${CMAKE_BINARY_DIR}/library/tools/bingen/staging" ) # if offline compilation is not chosen, bingen should not be built if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL) ExternalProject_Get_Property( bingen binary_dir ) set( bingenBinaryDir "" ) if( CMAKE_COMPILER_IS_GNUCXX ) set( bingenBinaryDir "${binary_dir}/staging" ) else() set( bingenBinaryDir "${binary_dir}/staging" ) # set( bingenBinaryDir "${binary_dir}/${CMAKE_CFG_INTDIR}" ) endif() if (LOAD_CL_FLAGS) add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" -DLOAD_CL_FLAGS=${LOAD_CL_FLAGS} -DTAHITI_FLAG=${OPENCL_FLAGS_REPLACED_1} -DHAWAII1_FLAG=${OPENCL_FLAGS_REPLACED_3} -DHAWAII2_FLAG=${OPENCL_FLAGS_REPLACED_5} -DBONAIRE_FLAG=${OPENCL_FLAGS_REPLACED_7} -DENV_PATH=${ENV_PATH} -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" ) add_dependencies( GEN_CLBIN bingen ) else() add_custom_target( GEN_CLBIN ) add_custom_command(TARGET GEN_CLBIN PRE_BUILD COMMAND ${CMAKE_COMMAND} -DbingenBinaryDir=${bingenBinaryDir} -DCLTEMPLATE_PATH="${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates" -DOPENCL_OFFLINE_BUILD_HAWAII_KERNEL=${OPENCL_OFFLINE_BUILD_HAWAII_KERNEL} -DOPENCL_OFFLINE_BUILD_BONAIRE_KERNEL=${OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL} -DOPENCL_OFFLINE_BUILD_TAHITI_KERNEL=${OPENCL_OFFLINE_BUILD_TAHITI_KERNEL} -P "${CMAKE_SOURCE_DIR}/library/bingen.cmake" ) add_dependencies( GEN_CLBIN bingen ) endif() endif()#if(OPENCL_OFFLINE_BUILD_TAHITI_KERNEL OR OPENCL_OFFLINE_BUILD_HAWAII_KERNEL OR OPENCL_OFFLINE_BUILD_BONAIRE_KERNEL) ExternalProject_Get_Property( tplgen binary_dir ) set( tplgenBinaryDir "" ) if( CMAKE_COMPILER_IS_GNUCXX ) set( tplgenBinaryDir ${binary_dir} ) else() set( tplgenBinaryDir "${binary_dir}/${CMAKE_CFG_INTDIR}" ) endif() add_custom_target( GENERATE_CLT COMMAND ${tplgenBinaryDir}/tplgen -o ${clBLAS_BINARY_DIR}/include -i ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates/ ${SRC_CL_TEMPLATES} COMMAND ${tplgenBinaryDir}/tplgen -o ${clBLAS_BINARY_DIR}/include -i ${bingenBinaryDir}/ ${SRC_CL_TEMPLATES_GEN} #WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates WORKING_DIRECTORY ${bingenBinaryDir} ) #add_dependencies( tplgen GEN_CLBIN ) add_dependencies( GENERATE_CLT tplgen ) if( CMAKE_COMPILER_IS_GNUCC ) configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLAS.pc.in ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc @ONLY ) install( FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc DESTINATION lib${SUFFIX_LIB}/pkgconfig ) endif( ) # clBLAS to depend on AutoGemm if ( ${PRECOMPILE_GEMM_ACTIVE} ) set( AUTOGEMM_PRECOMPILED_KERNELS_CONDITIONAL ${AUTOGEMM_PRECOMPILED_KERNELS} ) MESSAGE( STATUS "clBLAS will depend on ${AUTOGEMM_PRECOMPILED_KERNELS}" ) else() set( AUTOGEMM_PRECOMPILED_KERNELS_CONDITIONAL ) MESSAGE( STATUS "clBLAS will NOT depend on ${AUTOGEMM_PRECOMPILED_KERNELS}" ) endif() set(CLBLAS_ALL_SOURCES ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS} ${AUTOGEMM_SRC} ${AUTOGEMM_HEADERS} ${AUTOGEMM_SCRIPTS} ${AUTOGEMM_PRECOMPILED_KERNELS_CONDITIONAL} #${USERGEMM_SRC} 
#${USERGEMM_HEADERS} ) add_definitions(-DOPENCL_VERSION="${OPENCL_VERSION}") add_library(clBLAS ${CLBLAS_ALL_SOURCES}) add_dependencies(clBLAS GENERATE_CLT) function (add_target_definitions target) get_target_property(defs ${target} COMPILE_DEFINITIONS) if (defs MATCHES "NOTFOUND") set(defs "") endif () foreach (def ${defs} ${ARGN}) list(APPEND deflist ${def}) endforeach () set_target_properties(${target} PROPERTIES COMPILE_DEFINITIONS "${deflist}") endfunction () add_target_definitions(clBLAS BUILDING_CLBLAS) if (PRECOMPILE_TRSM_DTRSM OR PRECOMPILE_TRSM_STRSM) add_dependencies(clBLAS OCLBinaryGenerator_GEN) endif() # AutoGemm needs compiler flag to utilize pre-compiled kernels if ( ${PRECOMPILE_GEMM_ACTIVE} ) set_target_properties(clBLAS PROPERTIES COMPILE_FLAGS -DAUTOGEMM_USE_PRE_COMPILED_KERNELS) endif() set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION}) set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION}) set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY}) # CPack configuration; include the executable into the package install( TARGETS clBLAS EXPORT Library RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) # For debug builds, include the debug runtimes into the package for testing on non-developer machines set( CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP true ) set( CMAKE_INSTALL_DEBUG_LIBRARIES true ) set( CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY true ) if( WIN32 ) set( CLBLAS_RUNTIME_DESTINATION bin${SUFFIX_BIN} ) else( ) set( CLBLAS_RUNTIME_DESTINATION lib${SUFFIX_LIB} ) endif( ) include( InstallRequiredSystemLibraries ) # Install necessary runtime files for debug builds install( PROGRAMS ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS} CONFIGURATIONS Debug DESTINATION ${CLBLAS_RUNTIME_DESTINATION} ) # Install all *.pdb files for debug builds install( DIRECTORY ${PROJECT_BINARY_DIR}/staging/ DESTINATION ${CLBLAS_RUNTIME_DESTINATION} OPTIONAL CONFIGURATIONS Debug FILES_MATCHING PATTERN "*.pdb" ) # Install a snapshot of the source as it was for this build; useful for the .pdb's install( DIRECTORY ${PROJECT_SOURCE_DIR} DESTINATION ${CLBLAS_RUNTIME_DESTINATION} OPTIONAL CONFIGURATIONS Debug ) clblas-2.10/src/library/OCLBinaryGenerator.cmake000066400000000000000000000123031264277366700216050ustar00rootroot00000000000000 message(STATUS "inside OCLBinaryGenerator.cmake") message(STATUS "OCLBinary.cmake SOURCE_DIR=${SOURCE_DIR}") message(STATUS "OCLBinary.cmake BINARY_DIR=${BINARY_DIR}") execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_lower_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_128_16 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri diag_dtrtri_upper_192_12 ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) 
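# NOTE: every invocation in this script follows the same argument pattern,
#   OCLBinaryGenerator <kernel source dir> <kernel name> <output dir> [compiler flags]
# so the per-kernel calls could equivalently be expressed with a foreach() loop.
# The sketch below is illustrative only and is kept commented out; the list name
# TRTRI_KERNEL_NAMES is hypothetical and the explicit execute_process() calls in
# this file remain the authoritative path.
#
# set( TRTRI_KERNEL_NAMES
#   triple_dgemm_update_128_16_R
#   triple_dgemm_update_128_32_PART1_L
# )
# foreach( KERNEL_NAME ${TRTRI_KERNEL_NAMES} )
#   execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator
#     ${SOURCE_DIR}/library/blas/trtri
#     ${KERNEL_NAME}
#     ${BINARY_DIR}/include
#     ${OCL_COMPILER_FLAGS} )
# endforeach()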
execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_16_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_32_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_L ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_128_ABOVE64_PART3_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_12_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_24_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri 
triple_dgemm_update_192_48_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_48_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART1_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) execute_process( COMMAND ${OCLBinaryGeneratorBinaryDir}/OCLBinaryGenerator ${SOURCE_DIR}/library/blas/trtri triple_dgemm_update_192_96_PART2_R ${BINARY_DIR}/include ${OCL_COMPILER_FLAGS} ) clblas-2.10/src/library/bingen.cmake000066400000000000000000000110641264277366700174210ustar00rootroot00000000000000 #set( bingenBinaryDir "${CMAKE_SOURCE_DIR}/library/tools/bingen/staging" ) set (BIN_CL_TEMPLATES_TAHITI ${CLTEMPLATE_PATH}/dgemm_hawai.cl ${CLTEMPLATE_PATH}/dtrsm_gpu.cl ${CLTEMPLATE_PATH}/dgemm_gcn_SmallMatrices.cl ${CLTEMPLATE_PATH}/sgemm_gcn.cl ${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl #sgemm_hawaiiSplitKernel.cl ) set (BIN_CL_TEMPLATES_HAWAII_CL2 ${CLTEMPLATE_PATH}/dgemm_hawaiiSplitKernel.cl ${CLTEMPLATE_PATH}/sgemm_hawaiiSplitKernel.cl ${CLTEMPLATE_PATH}/sgemm_gcn.cl ${CLTEMPLATE_PATH}/zgemm_gcn.cl ${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl ${CLTEMPLATE_PATH}/sgemm_gcn_bigMatrices.cl ${CLTEMPLATE_PATH}/sgemm_hawaiiSplit64_32.cl ${CLTEMPLATE_PATH}/dtrsm_gpu192.cl ) set (BIN_CL_TEMPLATES_BONAIRE_CL ${CLTEMPLATE_PATH}/sgemm_hawaiiSplitKernel.cl ${CLTEMPLATE_PATH}/sgemm_gcn.cl ${CLTEMPLATE_PATH}/sgemm_gcn_SmallMatrices.cl ) set (BIN_CL_TEMPLATES_HAWAII_CL1 ${CLTEMPLATE_PATH}/dgemm_hawai.cl ${CLTEMPLATE_PATH}/dtrsm_gpu.cl ${CLTEMPLATE_PATH}/dgemm_hawaiiChannelConfilct.cl ${CLTEMPLATE_PATH}/dgemm_gcn_SmallMatrices.cl ) MESSAGE("run bingen") if(UNIX) MESSAGE("echo $LD_LIBRARY_PATH: $ENV{LD_LIBRARY_PATH}") else() MESSAGE("echo %PATH%: $ENV{PATH}") endif() if(OCL_OFFLINE_BUILD_TAHITI_KERNEL) if(LOAD_CL_FLAGS) if(UNIX) set(ENV{LD_LIBRARY_PATH} "${ENV_PATH}") MESSAGE("LD_LIBRARY_PATH : $ENV{LD_LIBRARY_PATH}") else() set(ENV{PATH} "${ENV_PATH}") MESSAGE("PATH : $ENV{PATH}") endif() string(REPLACE "\\" "" TAHITI_FLAG ${TAHITI_FLAG}) set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "${TAHITI_FLAG}") else() set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "") endif() MESSAGE("TAHITI AMD_OCL_BUILD_OPTIONS_APPEND FLAGS : $ENV{AMD_OCL_BUILD_OPTIONS_APPEND}") MESSAGE("command : ${bingenBinaryDir}/bingen Tahiti ${BIN_CL_TEMPLATES_TAHITI}") execute_process( COMMAND ${bingenBinaryDir}/bingen Tahiti ${BIN_CL_TEMPLATES_TAHITI} WORKING_DIRECTORY ${bingenBinaryDir}# ) else() MESSAGE(STATUS "Tahiti kernels will be built at runtime. Bingen not called.") endif() if (OCL_OFFLINE_BUILD_HAWAII_KERNEL) if(LOAD_CL_FLAGS) if(UNIX) set(ENV{LD_LIBRARY_PATH} "${ENV_PATH}") MESSAGE("LD_LIBRARY_PATH : $ENV{LD_LIBRARY_PATH}") else() set(ENV{PATH} "${ENV_PATH}") MESSAGE("PATH : $ENV{PATH}") endif() string(REPLACE "\\" "" HAWAII1_FLAG ${HAWAII1_FLAG}) set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "${HAWAII1_FLAG}") else() set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "") endif() MESSAGE("HAWAII CL1 AMD_OCL_BUILD_OPTIONS_APPEND FLAGS : $ENV{AMD_OCL_BUILD_OPTIONS_APPEND}") MESSAGE("command : ${bingenBinaryDir}/bingen Hawaii ${BIN_CL_TEMPLATES_HAWAII_CL1}") execute_process( COMMAND ${bingenBinaryDir}/bingen Hawaii ${BIN_CL_TEMPLATES_HAWAII_CL1} WORKING_DIRECTORY ${bingenBinaryDir}# ) else() MESSAGE(STATUS "Hawaii 1 kernels will be built at runtime. 
Bingen not called.") endif() if (OCL_OFFLINE_BUILD_HAWAII_KERNEL) if(LOAD_CL_FLAGS) if(UNIX) set(ENV{LD_LIBRARY_PATH} "${ENV_PATH}") MESSAGE("LD_LIBRARY_PATH : $ENV{LD_LIBRARY_PATH}") else() set(ENV{PATH} "${ENV_PATH}") MESSAGE("PATH : $ENV{PATH}") endif() string(REPLACE "\\" "" HAWAII2_FLAG ${HAWAII2_FLAG}) message("HAWAII2_FLAG = ${HAWAII2_FLAG}") set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "${HAWAII2_FLAG}") else() set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "") endif() MESSAGE("HAWAII CL2 AMD_OCL_BUILD_OPTIONS_APPEND FLAGS : $ENV{AMD_OCL_BUILD_OPTIONS_APPEND}") MESSAGE("ENV : $ENV{AMD_OCL_BUILD_OPTIONS_APPEND}") MESSAGE("command : ${bingenBinaryDir}/bingen Hawaii ${BIN_CL_TEMPLATES_HAWAII_CL2}") execute_process( COMMAND ${bingenBinaryDir}/bingen Hawaii ${BIN_CL_TEMPLATES_HAWAII_CL2} WORKING_DIRECTORY ${bingenBinaryDir}# ) else() MESSAGE(STATUS "Hawaii 2 kernels will be built at runtime. Bingen not called.") endif() if(OCL_OFFLINE_BUILD_BONAIRE_KERNEL) if(LOAD_CL_FLAGS) if(UNIX) set(ENV{LD_LIBRARY_PATH} "${ENV_PATH}") MESSAGE("LD_LIBRARY_PATH : $ENV{LD_LIBRARY_PATH}") else() set(ENV{PATH} "${ENV_PATH}") MESSAGE("PATH : $ENV{PATH}") endif() string(REPLACE "\\" "" BONAIRE_FLAG ${BONAIRE_FLAG}) message("BONAIRE_FLAG = ${BONAIRE_FLAG}") set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "${BONAIRE_FLAG}") else() set(ENV{AMD_OCL_BUILD_OPTIONS_APPEND} "") endif() MESSAGE("BONAIRE CL AMD_OCL_BUILD_OPTIONS_APPEND FLAGS : $ENV{AMD_OCL_BUILD_OPTIONS_APPEND}") MESSAGE("command : ${bingenBinaryDir}/bingen Bonaire ${BIN_CL_TEMPLATES_BONAIRE_CL}") execute_process( COMMAND ${bingenBinaryDir}/bingen Bonaire ${BIN_CL_TEMPLATES_BONAIRE_CL} WORKING_DIRECTORY ${bingenBinaryDir}# ) else() MESSAGE(STATUS "Bonaire kernels will be built at runtime. Bingen not called.") endif() clblas-2.10/src/library/blas/000077500000000000000000000000001264277366700160745ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/000077500000000000000000000000001264277366700176125ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/.gitignore000066400000000000000000000000271264277366700216010ustar00rootroot00000000000000*.cl *.swp *.txt *.pyc clblas-2.10/src/library/blas/AutoGemm/AutoGemm.py000066400000000000000000000033351264277366700217060ustar00rootroot00000000000000################################################################################ # AutoGemm # - Automatically generate gemm kernels based on tile parameters # - This script generates the following to ease integration into clBLAS: # - generate all the kernel files # - kernel selection logic # - include files for kernel strings # # TODO Now # - offline compilation # TODO Future # - fuse together unroll=8 and unroll=1 in same kernel ? 
# functionally works fine, but lowers performance by ~10% ################################################################################ import os import sys import argparse import getopt import Common import Includes import KernelSelection import AutoGemmParameters import KernelOpenCL ################################################################################ # Main ################################################################################ if __name__ == "__main__": # parse arguments ap = argparse.ArgumentParser(description="AutoGemm") ap.add_argument("--output-path", dest="output" ) ap.add_argument("--opencl-compiler-version", dest="clCompilerVersion", action="store", choices=["1.1", "1.2", "2.0" ]) ap.add_argument("--architecture", dest="architecture", action="store", choices=["Hawaii", "Fiji" ]) args = ap.parse_args() if args.output: Common.setOutputPath(args.output) else: print("AutoGemm.py: Warning: No output path specified; default is working directory.") print("AutoGemm.py: using OpenCL " + args.clCompilerVersion + " compiler") Common.setClCompilerVersion(args.clCompilerVersion) AutoGemmParameters.setArchitecture(args.architecture) KernelOpenCL.writeOpenCLKernels() KernelSelection.writeKernelSelection() Includes.writeIncludes() clblas-2.10/src/library/blas/AutoGemm/AutoGemmParameters.py000066400000000000000000000222701264277366700237310ustar00rootroot00000000000000import copy import KernelParameters ################################################################################ # Tile Parameters for Kernel Selection Data ################################################################################ kernelSelectionDataHawaii = { # [ size, fallback tile, [ valid tiles ] ], "s":[ [ 4000, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6] ] ], [ 2496, [ 16, 16, 4, 4], [ [ 16, 16, 6, 6], [ 16, 16, 4, 4] ] ], [ 2448, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6] ] ], [ 1600, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5] ] ], [ 1008, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 3, 3] ] ], [ 960, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 896, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 864, [ 16, 16, 2, 2], [ [ 16, 16, 6, 6], [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 4, 4], [ 16, 16, 2, 2] ] ], [ 784, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 768, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 720, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 3, 3] ] ], [ 464, [ 16, 16, 3, 3], [ [ 16, 16, 3, 3], [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 5, 5] ] ], [ 304, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "d":[ [ 5408, [ 8, 8, 6, 6], [ [ 8, 8, 6, 6], [ 16, 16, 4, 4] ] ], [ 2800, [ 16, 16, 4, 4], [ [ 8, 8, 6, 6], [ 16, 16, 4, 4] ] ], [ 1536, [ 16, 16, 4, 4], [ [ 8, 8, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5] ] ], [ 1136, [ 16, 16, 4, 4], [ [ 8, 8, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 576, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 8, 8, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 384, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 8, 8, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 256, [ 16, 16, 1, 1], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 
0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "c":[ [ 3840, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4] ] ], [ 2592, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 3, 3] ] ], [ 2224, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 2, 2] ] ], [ 720, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 5, 5] ] ], [ 432, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 3, 3], [ 16, 16, 1, 1] ] ], [ 288, [ 16, 16, 1, 1], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "z":[ [ 3008, [ 16, 16, 3, 3], [ [ 16, 16, 3, 3] ] ], [ 1344, [ 16, 16, 3, 3], [ [ 16, 16, 3, 3], [ 16, 16, 4, 4] ] ], [ 1040, [ 16, 16, 3, 3], [ [ 16, 16, 3, 3], [ 16, 16, 4, 4], [ 16, 16, 2, 2] ] ], [ 832, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 4, 4], [ 16, 16, 2, 2] ] ], [ 544, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 2, 2] ] ], [ 336, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 192, [ 16, 16, 1, 1], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], } kernelSelectionDataFiji = { "s":[ [ 3072, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 4, 4] ] ], [ 2240, [ 16, 16, 6, 6], [ [ 16, 16, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 3, 3] ] ], [ 1760, [ 16, 16, 4, 4], [ [ 16, 16, 6, 6], [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 3, 3] ] ], [ 1600, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 3, 3] ] ], [ 1056, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 3, 3], [ 16, 16, 2, 2] ] ], [ 960, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 3, 3], [ 16, 16, 2, 2] ] ], [ 736, [ 16, 16, 3, 3], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 528, [ 16, 16, 3, 3], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 432, [ 16, 16, 2, 2], [ [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 320, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "d":[ [ 3200, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 5, 5], [ 16, 16, 2, 2], [ 8, 8, 6, 6 ] ] ], [ 1632, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 5, 5], [ 8, 8, 6, 6 ] ] ], [ 1280, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 5, 5], [ 8, 8, 6, 6 ], [ 16, 16, 1, 1] ] ], [ 1056, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 672, [ 16, 16, 2, 2], [ [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "c":[ [ 2240, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], ] ], [ 1440, [ 16, 16, 4, 4], [ [ 16, 16, 4, 4], [ 16, 16, 6, 6], [ 16, 16, 5, 5], [ 16, 16, 2, 2] ] ], [ 1088, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 3, 3], [ 16, 16, 2, 2], [ 16, 16, 5, 5] ] ], [ 704, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 3, 3], [ 16, 16, 5, 5] ] ], [ 528, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 3, 3], [ 16, 16, 1, 1] ] ], [ 336, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ], "z":[ [ 2528, [ 16, 16, 2, 2], [ [ 16, 16, 4, 4], [ 16, 16, 2, 2], [ 16, 16, 3, 3] ] ], [ 1872, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 3, 3], [ 16, 16, 1, 1] ] ], [ 1040, [ 16, 16, 2, 2], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 768, [ 16, 16, 1, 1], [ [ 16, 16, 2, 2], [ 16, 16, 1, 1] ] ], [ 0, [ 16, 16, 1, 1], [ [ 16, 16, 1, 1] ] ], ] } kernelSelectionData = kernelSelectionDataHawaii def 
setArchitecture(architecture): global kernelSelectionData, kernelSelectionDataHawaii, kernelSelectionDataFiji if architecture == "Fiji": kernelSelectionData = kernelSelectionDataFiji else: kernelSelectionData = kernelSelectionDataHawaii ################################################################################ # Non-Tile Parameters ################################################################################ precisions = ["s", "d", "c", "z"] orders = [ "clblasColumnMajor" ] transposes = { "s":["N", "T"], "d":["N", "T"], \ "c":["N", "T", "C"], "z":["N", "T", "C"] } unrolls = { "s":[16, 8, 1], "d":[8, 1], "c":[8, 1], "z":[8, 1] } betas = [ 0, 1 ] def getTilesForPrecision(precision): # valid tiles for this precision tiles = [] tile = KernelParameters.TileParameters() for sizeData in kernelSelectionData[precision]: fallbackTile = sizeData[1] validTiles = sizeData[2] # add valid tiles for tileParams in validTiles: #print(tileParams) tile.workGroupNumRows = tileParams[0] tile.workGroupNumCols = tileParams[1] tile.microTileNumRows = tileParams[2] tile.microTileNumCols = tileParams[3] tile.macroTileNumRows = tile.workGroupNumRows*tile.microTileNumRows tile.macroTileNumCols = tile.workGroupNumCols*tile.microTileNumCols #print(tile.getName()) for unroll in unrolls[precision]: tile.unroll = unroll if tile.isValid(): tiles.append( copy.copy(tile) ) else: print(tile.getName() + " - SKIPPING - ") # add fallback tile tile.workGroupNumRows = fallbackTile[0] tile.workGroupNumCols = fallbackTile[1] tile.microTileNumRows = fallbackTile[2] tile.microTileNumCols = fallbackTile[3] tile.macroTileNumRows = tile.workGroupNumRows*tile.microTileNumRows tile.macroTileNumCols = tile.workGroupNumCols*tile.microTileNumCols for unroll in unrolls[precision]: tile.unroll = unroll if tile.isValid(): tiles.append( copy.copy(tile) ) else: print(tile.getName() + " - SKIPPING - ") setTiles = set(tiles) tiles = list( setTiles ) tiles.sort() return tiles def getTransposeChoices(): singleTransposes = [] for precision in precisions: for transpose in transposes[precision]: singleTransposes.append( transpose ) singleTransposeSet = set(singleTransposes) singleTranspose = list( singleTransposeSet) transposeChoices = [] for transA in singleTranspose: for transB in singleTranspose: transposePair = transA+transB if transposePair not in transposeChoices: transposeChoices.append(transposePair) return transposeChoices def getTileChoices(): tileChoices = [] for precision in precisions: tilesForPrecision = getTilesForPrecision(precision) for t in tilesForPrecision: tile = str(t.workGroupNumRows*t.microTileNumRows) + "x" + str(t.workGroupNumCols*t.microTileNumCols) if tile not in tileChoices: tileChoices.append(tile) return tileChoices clblas-2.10/src/library/blas/AutoGemm/AutoGemmTeardown.h000066400000000000000000000001671264277366700232110ustar00rootroot00000000000000#pragma once #ifdef __cplusplus #extern "C" { #endif void initAutoGemmClKernels(void); #ifdef __cplusplus } #endif clblas-2.10/src/library/blas/AutoGemm/AutoGemmTools/000077500000000000000000000000001264277366700223515ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmPreCompileKernels.cpp000066400000000000000000000676731264277366700301220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #include #include #include #ifdef __GNUC__ // Linux #include #include #include #include #else // Windows #include #include #include #include #define stat _stat #endif #include "CL/opencl.h" //#include "naive_blas.cpp" //using namespace NaiveBlas; #include "AutoGemmIncludes/AutoGemmKernelsToPreCompile.h" #include "AutoGemmIncludes/AutoGemmKernelSelectionSpecific.h" #include "UserGemmKernelSources/UserGemmClKernels.h" #include "UserGemmKernelSources/UserGemmKernelSourceIncludes.h" unsigned int totalKernelsToCompile; unsigned int numKernelsCompiled; char *path; std::ofstream includeFile; //std::clock_t clockStart; unsigned long long clockStart; unsigned long long clockFrequency; /****************************************************************************** * Check OpenCL Errors *****************************************************************************/ #define CL_CHECK(STATUS) \ if(STATUS != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", STATUS, __LINE__); \ assert(false); \ } /****************************************************************************** * Get AMD Platform *****************************************************************************/ cl_int getAMDPlatform(cl_platform_id *platform) { *platform = NULL; cl_int status = CL_SUCCESS; // get num platforms cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(status != CL_SUCCESS) { std::cout << "Error: clGetPlatformIDs failed. Error code: " << status << std::endl; return status; } if (numPlatforms > 0) { // Get selected platform cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformIDs failed. Error code : " << status << std::endl; return status; } // Print all platforms for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformInfo failed. Error code : " << status << std::endl; return status; } //std::cout << "Platform " << i << " : " << pbuf << std::endl; } // Get AMD platform for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl; return status; } *platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } // verify AMD platform char pbuf[100]; status = clGetPlatformInfo(*platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(status != CL_SUCCESS) { std::cout<<"Error: clGetPlatformInfo failed. Error code: " << status << std::endl; return status; } if (strcmp(pbuf, "Advanced Micro Devices, Inc.")) { std::cout << "AMD platform not found" << std::endl; return CL_INVALID_PLATFORM; } } else { std::cout << "No OpenCL platforms found." 
<< std::endl; return CL_INVALID_PLATFORM; } return status; } /****************************************************************************** * Precision -> char *****************************************************************************/ template char getPrecisionChar(); template<> char getPrecisionChar(){ return 's'; } template<> char getPrecisionChar(){ return 'd'; } template<> char getPrecisionChar(){ return 'c'; } template<> char getPrecisionChar(){ return 'z'; } /****************************************************************************** * get kernel name *****************************************************************************/ template int getKernelName( char **kernelName, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, bool extraRow, bool extraCol, char *appendstring) { int n = sprintf( *kernelName, "%cgemm_%s_%s%s_B%i_M%c%03u_N%c%03u_KX%02u", getPrecisionChar(), order==clblasColumnMajor ? "Col" : "Row", transA==clblasNoTrans ? "N" : transA==clblasTrans ? "T" : "C", transB==clblasNoTrans ? "N" : transB==clblasTrans ? "T" : "C", beta ? 1 : 0, extraRow ? 'L' : 'X', macroTileNumRows, extraCol ? 'L' : 'X', macroTileNumCols, unroll ); int n2 = 0; if (appendstring != NULL) { n2 = sprintf((*kernelName) + n, appendstring); } return n2 + n; } template int getStringName( char **stringName, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, bool extraRow, bool extraCol, char *appendstring) { int n = getKernelName(stringName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendstring); int n2 = sprintf( (*stringName)+n, "_bin" ); return n+n2; } template int getFileName( char **fileName, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, bool extraRow, bool extraCol, char *appendstring) { int n = getKernelName(fileName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendstring); int n2 = sprintf( (*fileName)+n, "_bin.cpp" ); return n+n2; } template int getPreprocessorName( char **preprocessorName, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, bool extraRow, bool extraCol, char *appendstring) { char kernelNameArray[64]; char *kernelName = kernelNameArray; int n = getKernelName(&kernelName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendstring); for ( int i = 0; i < n; i++) { kernelName[i] = toupper(kernelName[i]); } int n2 = sprintf( *preprocessorName, "KERNEL_%s_BIN_CPP", kernelName ); return n2; } /****************************************************************************** * get kernel binary from source *****************************************************************************/ cl_int getKernelBinaryFromSource( cl_context context, const char *source, const char *buildOptions, char **binary, size_t *binarySize) { cl_int status = CL_SUCCESS; // create program cl_program program = clCreateProgramWithSource(context,1, &source, NULL, &status); CL_CHECK(status); cl_uint numDevicesInContext; status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL); 
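// NOTE: the context may expose several devices, but only devices[0] is used below;
// handling a multi-device context would require one clBuildProgram and one
// clGetProgramInfo(CL_PROGRAM_BINARIES) pass per device (not implemented here).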
CL_CHECK(status); // get devices //printf("Devices: %u\n", numDevicesInContext); cl_device_id* devices = new cl_device_id[numDevicesInContext]; clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevicesInContext*sizeof(cl_device_id), devices, NULL); CL_CHECK(status); // choose device 0 cl_device_id device = devices[0]; // build program for device status = clBuildProgram(program, 1, &device, buildOptions, NULL, NULL); // print build failure if (status != CL_SUCCESS) { printf("clBuildProgram Failed\n"); printf("status = %d\n", status); size_t len=0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); char* buildLog = new char[len]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0); printf("\nBuild Log:\n\n"); printf("%s\n", buildLog); printf("\n\nKernel String:\n\n"); printf("%s\n", source); binary[0] = 0; *binarySize = 0; return status; } // get binary from program status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binarySize, NULL); //printf("BinarySize: %llu\n", *binarySize); binary[0] = new char[*binarySize]; //for (int i = 0; i < *binarySize; i++) binary[0][i] = 512; //size_t size = 2; //status = -1; //for (int i = 0; status; i++) { // printf("size=%i\n", i); status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 8 /*?*/, binary, NULL); //} CL_CHECK(status); //for (int i = 0; i < *binarySize; i++) { // std::cout << std::setw(3) << (int)binary[0][i] << ","; //} //printf("binary[0][0] = %p\n", binary[0][0]); //printf("binary[0] = %p\n", binary[0]); //printf("binary = %p\n", binary); //printf("&binary = %p\n", &binary); return CL_SUCCESS; } /****************************************************************************** * write binary to stream *****************************************************************************/ void writeBinaryToStream(std::ostream & out, char *binary, size_t binarySize) { for(int i = 0; i < binarySize; i++) { out << std::setw(4) << (int) binary[i]; if(i < binarySize-1) { out << ","; } if((i+1)%16 == 0) { out << std::endl; } } out << std::endl; } /****************************************************************************** * Pre-compile kernels within parameter group and write to file *****************************************************************************/ template void compileKernelAndWriteToFile( cl_context context, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, bool extraRow, bool extraCol, const char *source, const char *buildOptions, char* appendString) { // get kernel name char stringNameArray[64]; char fileNameArray[64]; char preprocessorNameArray[64]; char *stringName = &stringNameArray[0]; char *fileName = &fileNameArray[0]; char *preprocessorName = &preprocessorNameArray[0]; int stringNameLength = getStringName(&stringName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendString); int fileNameLength = getFileName(&fileName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendString); int preprocessorNameLength = getPreprocessorName(&preprocessorName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, extraRow, extraCol, appendString); // get kernel binary char **kernelBinary = new char*[1]; kernelBinary[0] = 0; size_t kernelBinarySize; cl_int status = 
getKernelBinaryFromSource(context, source, buildOptions, kernelBinary, &kernelBinarySize); if (status == CL_SUCCESS) { // write binary to file std::ofstream kernelFile; std::string fullFilePath; fullFilePath += path; fullFilePath += fileName; kernelFile.open(fullFilePath.c_str(), std::ios::out); kernelFile << "/* AutoGemm Pre-Compiled kernel binary */" << std::endl << std::endl; kernelFile << "#define " << preprocessorName << std::endl << std::endl; kernelFile << "char " << stringName << "Array[" << kernelBinarySize << "] = {" << std::endl; //kernelFile << "unsigned char *" << stringName << " = {" << std::endl; //kernelFile << "unsigned char " << stringName << "[] = {" << std::endl; writeBinaryToStream( kernelFile, *kernelBinary, kernelBinarySize ); kernelFile << "};" << std::endl; kernelFile << "unsigned char *" << stringName << " = " << "reinterpret_cast(" << stringName << "Array);" << std::endl; kernelFile << "size_t " << stringName << "Size = " << kernelBinarySize << ";" << std::endl; kernelFile.close(); // add file to include includeFile << "#include \"AutoGemmKernelBinaries/" << fileName << "\"" << std::endl; } if (kernelBinary[0]) delete[] kernelBinary[0]; delete[] kernelBinary; // report kernel compiled numKernelsCompiled++; // how much time left unsigned long long clockCurrent; #if defined( _WIN32 ) ::QueryPerformanceCounter( reinterpret_cast( &clockCurrent ) ); #else struct timeval s; gettimeofday(&s, 0); clockCurrent = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec; #endif double elapsedTimeSec = ((double) clockCurrent - clockStart) / clockFrequency; double timePerKernel = elapsedTimeSec / numKernelsCompiled; double timeRemaining = timePerKernel * (totalKernelsToCompile - numKernelsCompiled); //printf("AutoGemm-PreCompile[%3u/%3u]: %s %7u bytes ( %.0f sec remaining)\n", numKernelsCompiled, totalKernelsToCompile, stringName, kernelBinarySize, timeRemaining); std::cout << "AutoGemm-PreCompile[" << std::setw(3) << numKernelsCompiled << "/" << std::setw(3) << totalKernelsToCompile << "]: " << stringName << std::setw(4) << kernelBinarySize/1024 << " kB (" << std::setw(4) << (int) timeRemaining << " sec remaining)" << std::endl; } /****************************************************************************** * compile kernels within parameter group and write to file *****************************************************************************/ template cl_int compileKernelGroupAndWriteToFile( cl_context context, clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool beta, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll ) { const char *tileKernelSource; const char *rowKernelSource; const char *colKernelSource; const char *cornerKernelSource; const char *sourceBuildOptions; const unsigned char *tileKernelBinary; const unsigned char *rowKernelBinary; const unsigned char *colKernelBinary; const unsigned char *cornerKernelBinary; size_t *tileKernelBinarySize; size_t *rowKernelBinarySize; size_t *colKernelBinarySize; size_t *cornerKernelBinarySize; const char *binaryBuildOptions; cl_kernel *tileClKernel; cl_kernel *rowClKernel; cl_kernel *colClKernel; cl_kernel *cornerClKernel; unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int microTileNumRows; unsigned int microTileNumCols; bool kernelFound = gemmSelectKernelSpecific( order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, 
&tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols ); if (!kernelFound) { totalKernelsToCompile -= 4; char stringNameArray[64]; char *stringName = &stringNameArray[0]; int stringNameLength = getStringName( &stringName, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, 0, 0, NULL); printf("AutoGemm-PreCompile: %s not found; skipping.\n", stringName ); return 0; } compileKernelAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, false, // extra row false, // extra col tileKernelSource, sourceBuildOptions, NULL); compileKernelAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, true, // extra row false, // extra col rowKernelSource, sourceBuildOptions, NULL); compileKernelAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, false, // extra row true, // extra col colKernelSource, sourceBuildOptions, NULL); compileKernelAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, true, // extra row true, // extra col cornerKernelSource, sourceBuildOptions, NULL); return 1; } /****************************************************************************** * Main *****************************************************************************/ int main( int argc, char *argv[] ) { if (argc < 2 ) { printf("AutoGemmPreCompileKernels requires output path as argument\n"); exit(-1); } size_t pathLength = std::strlen(argv[1]); path = new char[pathLength+64]; sprintf(path, "%s/include/AutoGemmKernelBinaries/", argv[1]); //printf("AutoGemm-PreCompile: writing to %s\n", path); std::string fullIncludeFilePath; fullIncludeFilePath += path; fullIncludeFilePath += "AutoGemmKernelBinariesPreCompiled.h"; includeFile.open(fullIncludeFilePath.c_str(), std::ios::out); const char *includeFileHeader = "/*****************************************************************************\n" " * this file auto-generated by AutoGemmPreCompileKernels\n" " ****************************************************************************/\n\n"; includeFile << includeFileHeader; // get AMD platform cl_platform_id platform; cl_int status = getAMDPlatform( &platform ); CL_CHECK(status); cl_uint numDevices; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); CL_CHECK(status); // get all gpu devices //printf("NumDevicesInPlatform: %u\n", numDevices); cl_device_id* devices = new cl_device_id[numDevices]; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); CL_CHECK(status); // choose device 0 cl_device_id device = devices[0]; // create context cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context context = clCreateContext( cps, 1, // device &device, NULL, NULL, &status); CL_CHECK(status); cl_uint numDevicesInContext; status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL); //printf("NumDevicesInContext: %u\n", numDevicesInContext); CL_CHECK(status); clblasOrder order; clblasTranspose transA; clblasTranspose transB; bool beta; unsigned int macroTileNumRows; unsigned int macroTileNumCols; unsigned int unroll; // timer clockStart = clock(); #if defined( 
_WIN32 ) // OS call to get ticks per second2 ::QueryPerformanceFrequency( reinterpret_cast( &clockFrequency ) ); #else clockFrequency = 1000000; #endif #if defined( _WIN32 ) ::QueryPerformanceCounter( reinterpret_cast( &clockStart ) ); #else struct timeval s; gettimeofday(&s, 0); clockStart = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec; #endif const int specialKernelCount = user_kernel_count; totalKernelsToCompile = gemmPreCompileNum; totalKernelsToCompile *= 4; totalKernelsToCompile += specialKernelCount; numKernelsCompiled = 0; //precompile user defined kernels //all of the user defined special kernels will be precompiled if precompile is active //there are 7 user defined special kernels refer to UserGemmKernelIncludes.h const char *tileKernelSourceArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src, sgemm_Col_NT_B1_MX128_NX128_KX16_src, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src }; const unsigned char *tileKernelBinaryArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin, sgemm_Col_NT_B1_MX128_NX128_KX16_bin, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin }; size_t tileKernelBinarySizeArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize, sgemm_Col_NT_B1_MX128_NX128_KX16_binSize, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize }; unsigned int workGroupNumRowsArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumRows, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumRows, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumRows, sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumRows, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows }; unsigned int workGroupNumColsArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumCols, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumCols, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumCols, sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumCols, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols }; unsigned int microTileNumRowsArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumRows, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumRows, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumRows, sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumRows, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumRows, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows }; unsigned int microTileNumColsArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumCols, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumCols, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumCols, sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumCols, 
sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumCols, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols }; unsigned int unrollArray[specialKernelCount] = { sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_unroll, sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_unroll, sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_unroll, sgemm_Col_NT_B1_MX128_NX128_KX16_unroll, sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_unroll, sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_unroll, sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_unroll }; char *appendStringArray[specialKernelCount] = { "_ROW", "_COLUMN", "_SINGLE", NULL, "_BRANCH", "_BRANCH", "_BRANCH" }; clblasTranspose transA_Array[specialKernelCount] = { clblasNoTrans, clblasNoTrans, clblasNoTrans, clblasNoTrans, clblasNoTrans, clblasNoTrans, clblasTrans }; clblasTranspose transB_Array[specialKernelCount] = { clblasTrans, clblasTrans, clblasTrans, clblasTrans, clblasNoTrans, clblasTrans, clblasNoTrans }; for (int i = 0; i < specialKernelCount; i++) { const char *tileKernelSource; const unsigned char *tileKernelBinary; size_t tileKernelBinarySize; const char *binaryBuildOptions; //cl_kernel *tileClKernel; unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int unroll; tileKernelSource = tileKernelSourceArray[i]; tileKernelBinary = tileKernelBinaryArray[i]; tileKernelBinarySize = tileKernelBinarySizeArray[i]; binaryBuildOptions = User_binBuildOptions; workGroupNumRows = workGroupNumRowsArray[i]; workGroupNumCols = workGroupNumColsArray[i]; macroTileNumRows = microTileNumRowsArray[i] * workGroupNumRowsArray[i]; macroTileNumCols = microTileNumColsArray[i] * workGroupNumColsArray[i]; unroll = unrollArray[i]; beta = 1.0; char *appendString = appendStringArray[i]; compileKernelAndWriteToFile( context, clblasColumnMajor, transA_Array[i], transB_Array[i], beta, macroTileNumRows, macroTileNumCols, unroll, false, // extra row false, // extra col tileKernelSource, binaryBuildOptions, appendString); } // for each kernel to be pre-compiled //totalKernelsToCompile = gemmPreCompileNum; //totalKernelsToCompile *= 4; //numKernelsCompiled = 0; for (unsigned int i = 0; i < gemmPreCompileNum ; i++) { // unload parameters // idx 0 is precision order = gemmPreCompile[i][1]==1 ? clblasColumnMajor : clblasRowMajor; transA = gemmPreCompile[i][2]==0 ? clblasNoTrans : gemmPreCompile[i][2]==1 ? clblasTrans : clblasConjTrans; transB = gemmPreCompile[i][3]==0 ? clblasNoTrans : gemmPreCompile[i][3]==1 ? 
clblasTrans : clblasConjTrans; beta = gemmPreCompile[i][4]==1; macroTileNumRows = gemmPreCompile[i][5]; macroTileNumCols = gemmPreCompile[i][6]; unroll = gemmPreCompile[i][7]; if (gemmPreCompile[i][0] == 0) { // sgemm compileKernelGroupAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll ); } else if (gemmPreCompile[i][0] == 1) { // dgemm compileKernelGroupAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll ); } else if (gemmPreCompile[i][0] == 2) { // cgemm compileKernelGroupAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll ); } else if (gemmPreCompile[i][0] == 3) { // zgemm compileKernelGroupAndWriteToFile( context, order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll ); } }// end for //precompile user defined kernels unsigned long long clockCurrent; #if defined( _WIN32 ) ::QueryPerformanceCounter( reinterpret_cast( &clockCurrent ) ); #else gettimeofday(&s, 0); clockCurrent = (unsigned long long)s.tv_sec * 1000000 + (unsigned long long)s.tv_usec; #endif double elapsedTimeSec = ((double)clockCurrent - clockStart) / clockFrequency; includeFile.close(); std::cout << "Total Compile Time: " << elapsedTimeSec << " sec" << std::endl; //system("PAUSE"); return 0; } clblas-2.10/src/library/blas/AutoGemm/AutoGemmTools/AutoGemmUtil.h000066400000000000000000000417021264277366700251020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 * ************************************************************************/

#if defined (_MSC_VER)
#define __template_static static
#define isnan(x) _isnan((x))
#pragma warning( disable : 4290 )
#else /* _MSC_VER */
#define __template_static
#endif /* !_MSC_VER */

#include

#define CREAL(v) ((v).s[0])
#define CIMAG(v) ((v).s[1])

// Type-dependent constants
template <typename T>
static T ZERO() { return static_cast<T>(0.0); }
template<> __template_static FloatComplex ZERO<FloatComplex>() { return floatComplex(0.0, 0.0); }
template<> __template_static DoubleComplex ZERO<DoubleComplex>() { return doubleComplex(0.0, 0.0); }

template <typename T>
static T ONE() { return static_cast<T>(1.0); }
template<> __template_static FloatComplex ONE<FloatComplex>() { return floatComplex(1.0, 0.0); }
template<> __template_static DoubleComplex ONE<DoubleComplex>() { return doubleComplex(1.0, 0.0); }

template <typename T>
static T TWO() { return static_cast<T>(2.0); }
template<> __template_static FloatComplex TWO<FloatComplex>() { return floatComplex(2.0, 0.0); }
template<> __template_static DoubleComplex TWO<DoubleComplex>() { return doubleComplex(2.0, 0.0); }

template <typename T>
static bool isNAN(T x) { return (isnan(x) != 0); }
template<> __template_static bool isNAN(FloatComplex x) { return (isNAN(CREAL(x)) && isNAN(CIMAG(x))); }
template<> __template_static bool isNAN(DoubleComplex x) { return (isNAN(CREAL(x)) && isNAN(CIMAG(x))); }

/* Type-dependent random() */
template <typename T>
static T random(cl_double limit)
{
    T v;
    cl_ulong l = static_cast<cl_ulong>(limit);

    if (l == 0) {
        return 0;
    }
    v = static_cast<T>(rand() % l);
    if ((rand() % 2) == 1) v = -v;
    return v;
}

template <typename T>
static T random(cl_double left, cl_double right)
{
    T v;
    T l = static_cast<T>(left);

    v = random<T>(right - left);
    if (v < 0) {
        v -= l;
    } else {
        v += l;
    }
    return v;
}

template <typename T>
static T random() { return random<T>(static_cast<cl_double>(10)); }

template<> __template_static FloatComplex random<FloatComplex>() { return floatComplex(random<cl_float>(), random<cl_float>()); }
template<> __template_static FloatComplex random<FloatComplex>(cl_double limit) { return floatComplex(random<cl_float>(limit), random<cl_float>(limit)); }
template<> __template_static FloatComplex random<FloatComplex>(cl_double left, cl_double right) { return floatComplex(random<cl_float>(left, right), random<cl_float>(left, right)); }
template<> __template_static DoubleComplex random<DoubleComplex>() { return doubleComplex(random<cl_double>(), random<cl_double>()); }
template<> __template_static DoubleComplex random<DoubleComplex>(cl_double limit) { return doubleComplex(random<cl_double>(limit), random<cl_double>(limit)); }
template<> __template_static DoubleComplex random<DoubleComplex>(cl_double left, cl_double right) { return doubleComplex(random<cl_double>(left, right), random<cl_double>(left, right)); }

/* Boolean operators */
template <typename T>
static bool operator==(T a, T b) { return (a == b); }
template<> __template_static bool operator==(FloatComplex a, FloatComplex b) { return ((CREAL(a) == CREAL(b)) && (CIMAG(a) == CIMAG(b))); }
template<> __template_static bool operator==(DoubleComplex a, DoubleComplex b) { return ((CREAL(a) == CREAL(b)) && (CIMAG(a) == CIMAG(b))); }

template <typename T>
static bool operator!=(T a, T b) { return !(a == b); }

/* math operators */
static __inline float conjugate(float elem) { return elem; }
static __inline double conjugate(double elem) { return elem; }
static __inline FloatComplex conjugate(FloatComplex elem) { return floatComplex(CREAL(elem), -CIMAG(elem)); }
static __inline DoubleComplex conjugate(DoubleComplex elem) { return doubleComplex(CREAL(elem), -CIMAG(elem)); }

static __inline FloatComplex operator+(FloatComplex a, FloatComplex b)
{
    return floatComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b));
}

static __inline FloatComplex operator-(FloatComplex a, FloatComplex b)
{
    return floatComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b));
}
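/* Minimal usage sketch (not part of the original header): floatComplex() packs a
 * cl_float2, CREAL/CIMAG unpack it, and the overloaded +, -, == and conjugate
 * helpers above follow the usual complex-arithmetic rules. The function name
 * below is hypothetical and exists only to illustrate how the helpers compose. */
static __inline bool exampleComplexHelperUsage()
{
    FloatComplex a = floatComplex(1.0f, 2.0f);   // 1 + 2i
    FloatComplex b = floatComplex(3.0f, -1.0f);  // 3 - 1i
    FloatComplex sum   = a + b;                  // 4 + 1i
    FloatComplex diff  = a - b;                  // -2 + 3i
    FloatComplex conjA = conjugate(a);           // 1 - 2i
    return (sum  == floatComplex(4.0f, 1.0f)) &&
           (diff == floatComplex(-2.0f, 3.0f)) &&
           (CIMAG(conjA) == -2.0f);
}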
static __inline FloatComplex operator*(FloatComplex a, FloatComplex b) { return floatComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static __inline FloatComplex operator*(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) * b, CIMAG(a) * b); } static __inline FloatComplex operator/(FloatComplex a, FloatComplex b) { cl_float div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return floatComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static __inline FloatComplex operator/(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) / b, CIMAG(a) / b); } static __inline DoubleComplex operator+(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b)); } static __inline DoubleComplex operator-(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b)); } static __inline DoubleComplex operator*(DoubleComplex a, DoubleComplex b) { return doubleComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static __inline DoubleComplex operator*(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) * b, CIMAG(a) * b); } static __inline DoubleComplex operator/(DoubleComplex a, DoubleComplex b) { cl_double div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return doubleComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static __inline DoubleComplex operator/(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) / b, CIMAG(a) / b); } cl_int module(cl_int a) { return abs(a); } cl_float module(cl_float a) { return fabsf(a); } cl_double module(cl_double a) { return fabs(a); } cl_float module(FloatComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrtf(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } cl_double module(DoubleComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrt(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } #define FLOAT_UPPER_BOUND pow(2.0, 23) #define DOUBLE_UPPER_BOUND pow(2.0, 52) // Type-dependant constants template static cl_double UPPER_BOUND(); template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND;} template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND; } /* Provide simple access to vector elements */ template class VectorAccessor { public: VectorAccessor( ElemType *vector, size_t len, IncType inc, bool conj=false) : vector_(vector), inc_(inc), len_(len), conj_(conj) { /* do nothing */ } ElemType& operator [] (size_t idx) throw (std::string) { ElemType *el; if (idx >= len_) { throw std::string("Trying to access vector beyond boundary!"); } if (inc_ > 0) { el = vector_ + idx * inc_; } else { el = vector_ + (len_ - idx - 1) * (-inc_); } if (conj_) { tmp_ = conjugate(*el); return tmp_; } else { return *el; } } private: ElemType *vector_; ElemType tmp_; IncType inc_; size_t len_; bool conj_; }; /* Mapping between logical and physical matrix layout */ template class MatrixAccessor { public: MatrixAccessor( T *matrix, clblasOrder order, clblasTranspose trans, size_t nrRows, size_t nrCols, size_t ld) : matrix_(matrix), nrRows_(nrRows), nrCols_(nrCols), ld_(ld) { conj_ = (trans == clblasConjTrans); if ((order == 
clblasColumnMajor && trans == clblasNoTrans) || (order == clblasRowMajor && trans != clblasNoTrans)) { tra_ = true; } else { tra_ = false; } } void flipTransposing(void) { tra_ = !tra_; } VectorAccessor operator [] (size_t row) const throw (std::string) { T *vector; size_t inc; if (row >= nrRows_) { throw std::string("Trying to access matrix beyond boundary!"); } if (tra_) { vector = matrix_ + row; inc = ld_; } else { vector = matrix_ + row * ld_; inc = 1; } return VectorAccessor(vector, nrCols_, inc, conj_); } private: T *matrix_; bool tra_; bool conj_; size_t nrRows_; size_t nrCols_; size_t ld_; }; template __template_static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, T alpha, const T *A, size_t lda, const T *B, size_t ldb, T beta, T *C, size_t ldc) { MatrixAccessor ma(const_cast(A), order, transA, M, K, lda); MatrixAccessor mb(const_cast(B), order, transB, K, N, ldb); MatrixAccessor mc(C, order, clblasNoTrans, M, N, ldc); size_t i, j, k; T tmp; for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * mb[k][j]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T alpha, const T *A, size_t lda, T *B, size_t ldb) { size_t i, j, k; size_t row, col; size_t rowsA = (side == clblasLeft) ? M : N; size_t colsB = (side == clblasLeft) ? N : M; MatrixAccessor ma(const_cast(A), order, transA, rowsA, rowsA, lda); MatrixAccessor mb(B, order, clblasNoTrans, rowsA, colsB, ldb); T tmp, a; bool revPass; revPass = (uplo == clblasLower) ^ (transA != clblasNoTrans); if (side == clblasRight) { ma.flipTransposing(); mb.flipTransposing(); revPass = !revPass; } for (i = 0; i < rowsA; i++) { row = (revPass) ? (rowsA - i - 1) : i; for (j = 0; j < colsB; j++) { size_t boundK = (revPass) ? row : (rowsA - row - 1); tmp = ZERO(); for (k = 0; k <= boundK; k++) { col = (revPass) ? k : (rowsA - k - 1); if ((k == boundK) && (diag == clblasUnit)) { a = ONE(); } else { a = ma[row][col]; } tmp = tmp + a * mb[col][j]; } mb[row][j] = tmp * alpha; } } } template __template_static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T alpha, const T *A, size_t lda, T *B, size_t ldb) { size_t i, j, k; size_t row, col; size_t rowsA = (side == clblasLeft) ? M : N; size_t colsB = (side == clblasLeft) ? N : M; MatrixAccessor ma(const_cast(A), order, transA, rowsA, rowsA, lda); MatrixAccessor mb(B, order, clblasNoTrans, rowsA, colsB, ldb); T tmp, a; bool revPass; revPass = (uplo == clblasUpper) ^ (transA != clblasNoTrans); if (side == clblasRight) { ma.flipTransposing(); mb.flipTransposing(); revPass = !revPass; } for (i = 0; i < rowsA; i++) { row = (revPass) ? (rowsA - i - 1) : i; for (j = 0; j < colsB; j++) { size_t boundK = (revPass) ? (rowsA - row - 1) : row; tmp = ZERO(); for (k = 0; k <= boundK; k++) { col = (revPass) ? (rowsA - k - 1) : k; if (col == row) { a = (diag == clblasUnit) ? 
ONE() : ma[row][col]; tmp = (mb[row][j] - tmp) / a; } else { tmp = tmp + ma[row][col] * mb[col][j]; } } mb[row][j] = tmp; } } for (i = 0; i < rowsA; i++) { for (j = 0; j < colsB; j++) { mb[i][j] = mb[i][j] * alpha; } } } template __template_static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, T alpha, const T *A, size_t lda, T beta, T *C, size_t ldc) { size_t i, j, k; clblasTranspose tr = trans == clblasNoTrans ? clblasNoTrans : clblasTrans; MatrixAccessor ma(const_cast(A), order, tr, N, K, lda); MatrixAccessor mc(C, order, clblasNoTrans, N, N, ldc); T tmp; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { if ((uplo == clblasLower && j > i) || (uplo == clblasUpper && i > j)) { continue; } tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * ma[j][k]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, T alpha, const T *A, size_t lda, const T *B, size_t ldb, T beta, T *C, size_t ldc) { size_t i, j, k; clblasTranspose tr = trans == clblasNoTrans ? clblasNoTrans : clblasTrans; MatrixAccessor ma(const_cast(A), order, tr, N, K, lda); MatrixAccessor mb(const_cast(B), order, tr, N, K, ldb); MatrixAccessor mc(C, order, clblasNoTrans, N, N, ldc); T tmp; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { if ((uplo == clblasLower && j > i) || (uplo == clblasUpper && i > j)) { continue; } tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * mb[j][k] + ma[j][k] * mb[i][k]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, T alpha, const T *A, size_t lda, const T *X, int incx, T beta, T *Y, int incy) { size_t sizeX, sizeY; size_t m, n; T tmp; if(transA == clblasNoTrans) { sizeX = N; sizeY = M; } else { sizeX = M; sizeY = N; } MatrixAccessor ma(const_cast(A), order, transA, sizeY, sizeX, lda); VectorAccessor vx(const_cast(X), sizeX, incx); VectorAccessor vy(const_cast(Y), sizeY, incy); for (m = 0; m < sizeY; m++) { tmp = ZERO(); for (n = 0; n < sizeX; n++) { tmp = tmp + ma[m][n] * vx[n]; } vy[m] = tmp * alpha + vy[m] * beta; } } template __template_static void symv( clblasOrder order, clblasUplo uplo, size_t N, T alpha, const T *A, size_t lda, const T *X, int incx, T beta, T *Y, int incy) { size_t m, n; T tmp; MatrixAccessor ma(const_cast(A), order, clblasNoTrans, N, N, lda); VectorAccessor vx(const_cast(X), N, incx); VectorAccessor vy(const_cast(Y), N, incy); for (m = 0; m < N; m++) { tmp = ZERO(); for (n = 0; n < N; n++) { if (((uplo == clblasUpper) && (m <= n)) || ((uplo == clblasLower) && (m >= n))) { tmp = tmp + ma[m][n] * vx[n]; } else { tmp = tmp + ma[n][m] * vx[n]; } } vy[m] = tmp * alpha + vy[m] * beta; } } clblas-2.10/src/library/blas/AutoGemm/AutoGemmTools/ProfileAutoGemm.cpp000066400000000000000000001307461264277366700261270ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #if defined( __APPLE__ ) || defined( __MACOSX ) #include #else #include #endif //#include "library/tools/ktest/naive/naive_blas.cpp" //using namespace NaiveBlas; #include "AutoGemmTools/AutoGemmUtil.h" #include "AutoGemmIncludes/AutoGemmKernelSelection.h" #include "AutoGemmIncludes/AutoGemmKernelSelectionSpecific.h" #include "AutoGemmIncludes/AutoGemmKernelEnumeration.h" #define SGEMM 1 #define DGEMM 0 #define CGEMM 0 #define ZGEMM 0 #define RANDOM_DATA 1 #define DO_VALIDATION 
0 #if SGEMM #define DATA_TYPE float #define DATA_TYPE_CONSTRUCTOR(X,Y) X const unsigned int numTiles = sgemmNumTiles; const unsigned int numNonTiles = sgemmNumNonTiles; const unsigned int numKernels = sgemmNumKernels; #ifdef USER_KERNELS const char * const ksrFileName = "prof_user_sgemm_ksr.txt"; const char * const rawFileName = "prof_user_sgemm_raw.csv"; #else const char * const ksrFileName = "prof_sgemm_ksr.txt"; const char * const rawFileName = "prof_sgemm_raw.csv"; #endif unsigned int systemSizeMax = 1000; #endif #if DGEMM #define DATA_TYPE double #define DATA_TYPE_CONSTRUCTOR(X,Y) X const unsigned int numTiles = dgemmNumTiles; const unsigned int numNonTiles = dgemmNumNonTiles; const unsigned int numKernels = dgemmNumKernels; #ifdef USER_KERNELS const char * const ksrFileName = "prof_user_dgemm_ksr.txt"; const char * const rawFileName = "prof_user_dgemm_raw.csv"; #else const char * const ksrFileName = "prof_dgemm_ksr.txt"; const char * const rawFileName = "prof_dgemm_raw.csv"; #endif unsigned int systemSizeMax = 6000; #endif #if CGEMM #define DATA_TYPE FloatComplex #define DATA_TYPE_CONSTRUCTOR floatComplex const unsigned int numTiles = cgemmNumTiles; const unsigned int numNonTiles = cgemmNumNonTiles; const unsigned int numKernels = cgemmNumKernels; #ifdef USER_KERNELS const char * const ksrFileName = "prof_user_cgemm_ksr.txt"; const char * const rawFileName = "prof_user_cgemm_raw.csv"; #else const char * const ksrFileName = "prof_cgemm_ksr.txt"; const char * const rawFileName = "prof_cgemm_raw.csv"; #endif unsigned int systemSizeMax = 5500; #endif #if ZGEMM #define DATA_TYPE DoubleComplex #define DATA_TYPE_CONSTRUCTOR doubleComplex const unsigned int numTiles = zgemmNumTiles; const unsigned int numNonTiles = zgemmNumNonTiles; const unsigned int numKernels = zgemmNumKernels; #ifdef USER_KERNELS const char * const ksrFileName = "prof_user_zgemm_ksr.txt"; const char * const rawFileName = "prof_user_zgemm_raw.csv"; #else const char * const ksrFileName = "prof_zgemm_ksr.txt"; const char * const rawFileName = "prof_zgemm_raw.csv"; #endif unsigned int systemSizeMax = 5000; #endif #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif #define CL_CHECK(RET) \ if(RET != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", RET, __LINE__); \ assert(false); \ } unsigned int **tiles; typedef struct _RuleStack { unsigned int startSize; unsigned int validTileIndices[numTiles]; unsigned int numValidTiles; int fallbackTileIndex; _RuleStack() : numValidTiles(0), fallbackTileIndex(-1) {} } RuleStack; class KernelSelectionRules { public: RuleStack rule; RuleStack history[1024]; unsigned int numRulesInHistory; std::ostream & out; //constructor KernelSelectionRules( std::ostream & file) : numRulesInHistory(0), out(file) { } int getFastestValidTileIndex( unsigned int M, unsigned int N) { for (unsigned int i = 0; i < rule.numValidTiles; i++) { if ( M%tiles[rule.validTileIndices[i]][0]==0 && N%tiles[rule.validTileIndices[i]][1]==0) { return rule.validTileIndices[i]; } } return -1; } void removeTileFromRule( unsigned int tileIdx ) { int idx = -1; for ( int i = 0; i < rule.numValidTiles; i++) { if (rule.validTileIndices[i] == tileIdx) { idx = i; break; } } if (idx >= 0) { for ( int i = idx; i < rule.numValidTiles-1; i++) { rule.validTileIndices[i] = rule.validTileIndices[i+1]; } rule.numValidTiles--; } } void addTileToRule( unsigned int tileIdx ) { for (int i = rule.numValidTiles; i > 0; i--) { rule.validTileIndices[i] = rule.validTileIndices[i-1]; } rule.validTileIndices[0] = tileIdx; 
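// The newly observed fastest tile is promoted to the front of the rule's
// priority list (the loop above shifts the existing entries right to make
// room); the valid-tile count is incremented immediately below.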
rule.numValidTiles++; } bool add( unsigned int M, unsigned int N, unsigned int *validTileIndices, unsigned int numValidTiles, unsigned int fallbackTileIndex ) { // print rule printf("rule[%4u+]: ", rule.startSize ); for (unsigned int i = 0; i < rule.numValidTiles; i++) { printf("%ux%u, ", tiles[rule.validTileIndices[i]][0], tiles[rule.validTileIndices[i]][1]); } if (rule.fallbackTileIndex>=0) { printf("; %ux%u", tiles[rule.fallbackTileIndex][0], tiles[rule.fallbackTileIndex][1]); } printf("\n"); // print add printf("check[%4u,%4u]: ", M, N ); for (unsigned int i = 0; i < numValidTiles; i++) { printf("%ux%u, ", tiles[validTileIndices[i]][0], tiles[validTileIndices[i]][1]); } printf("; fallback = %ux%u\n", tiles[fallbackTileIndex][0], tiles[fallbackTileIndex][1]); bool mismatch = false; // compare fallbacks if (rule.fallbackTileIndex < 0) { mismatch = true; printf("mismatch:no fallback tile\n" ); rule.fallbackTileIndex = fallbackTileIndex; } else { if (rule.fallbackTileIndex != fallbackTileIndex) { mismatch = true; printf("mismatch:rule fallback was %ux%u, whereas new fallback is %ux%u\n", tiles[rule.fallbackTileIndex][0], tiles[rule.fallbackTileIndex][1], tiles[fallbackTileIndex][0], tiles[fallbackTileIndex][1] ); rule.fallbackTileIndex = fallbackTileIndex; } } // compare fastest valid tile if (numValidTiles > 0) { int ruleFastestValidTileIndex = getFastestValidTileIndex(M,N); if (ruleFastestValidTileIndex < 0) { // no valid tile for this M,N mismatch = true; printf("mismatch:no valid tile for size=%u,%u\n", M, N); rule.validTileIndices[rule.numValidTiles] = validTileIndices[0]; rule.numValidTiles++; } else { if (ruleFastestValidTileIndex != validTileIndices[0]) { // there is a valid tile for this M,N but it mismatches mismatch = true; printf("mismatch:rule tile was %ux%u, whereas fastest is %ux%u\n", tiles[ruleFastestValidTileIndex][0], tiles[ruleFastestValidTileIndex][1], tiles[validTileIndices[0]][0], tiles[validTileIndices[0]][1] ); removeTileFromRule(validTileIndices[0]); // if it existed elsewhere in the rule stack addTileToRule(validTileIndices[0]); } } } // remove retired tiles for (unsigned int i = 0; i < rule.numValidTiles; i++) { if ( M%tiles[rule.validTileIndices[i]][0]==0 && N%tiles[rule.validTileIndices[i]][1]==0) { bool tileIsValid = false; for (unsigned int j = 0; j < numValidTiles; j++) { if (validTileIndices[j] == rule.validTileIndices[i]) { tileIsValid = true; break; } } if (!tileIsValid) { mismatch = true; printf("mismatch:tile %ux%u no longer valid\n", tiles[rule.validTileIndices[i]][0], tiles[rule.validTileIndices[i]][1]); removeTileFromRule( rule.validTileIndices[i] ); } } } // if new rule, add it to history if (mismatch) { // update history rule.startSize = sqrt(M*N)+0.5; history[numRulesInHistory] = rule; numRulesInHistory++; // print what we added printf("new[%4u+]: ", rule.startSize); for (unsigned int i = 0; i < rule.numValidTiles; i++) { printf("%ux%u, ", tiles[rule.validTileIndices[i]][0], tiles[rule.validTileIndices[i]][1]); } printf("; fallback = %ux%u\n", tiles[rule.fallbackTileIndex][0], tiles[rule.fallbackTileIndex][1]); // write size event out << " [ " << std::setw(4) << rule.startSize; // write fallback out << ", [ " << std::setw(2) << tiles[rule.fallbackTileIndex][0] << ", " << std::setw(2) << tiles[rule.fallbackTileIndex][1] << "]"; out << ", [ "; if (rule.numValidTiles >= 0) { out << "[ " << std::setw(2) << tiles[rule.validTileIndices[0]][0] << ", " << std::setw(2) << tiles[rule.validTileIndices[0]][1] << "]"; } for (unsigned int i = 1; i < 
rule.numValidTiles; i++) { out << ", [ " << std::setw(2) << tiles[rule.validTileIndices[i]][0] << ", " << std::setw(2) << tiles[rule.validTileIndices[i]][1] << "]"; } out << " ] ], \n"; out.flush(); } printf("\n"); return mismatch; } // end add }; template void randomMatrix( clblasOrder order, size_t rows, size_t columns, T *A, size_t lda) { size_t r, c; MatrixAccessor a(A, order, clblasNoTrans, rows, columns, lda); for (r = 0; r < rows; r++) { for (c = 0; c < columns; c++) { #if RANDOM_DATA a[r][c] = random(); #else a[r][c] = DATA_TYPE_CONSTRUCTOR(1, 0); #endif } } } /****************************************************************************** * Make Gemm Kernel *****************************************************************************/ void makeGemmKernel( cl_kernel *clKernel, cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, const char *binaryBuildOptions) { cl_int err; if (*clKernel) { // kernel has already been built, return #if 0 // get kernel name size_t kernelNameLength; err = clGetKernelInfo( *clKernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernelNameLength), NULL, &kernelNameLength ); CL_CHECK(err) char *kernelName = new char[kernelNameLength]; err = clGetKernelInfo( *clKernel, CL_KERNEL_FUNCTION_NAME, kernelNameLength*sizeof(char), kernelName, NULL ); CL_CHECK(err) printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName); delete[] kernelName; #endif return; } else { // kernel has not been built, so build it (from binary, preferably) cl_context clContext; cl_device_id clDevice; err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL); CL_CHECK(err) err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); CL_CHECK(err) cl_program clProgram; cl_int clBinaryStatus; if (*kernelBinary) { size_t kernelBinarySize = strlen((char *)*kernelBinary); clProgram = clCreateProgramWithBinary( clContext, 1, &clDevice, &kernelBinarySize, kernelBinary, &clBinaryStatus, &err ); CL_CHECK(err) err = clBuildProgram( clProgram, 1, &clDevice, binaryBuildOptions, NULL, NULL ); CL_CHECK(err) } else { clProgram = clCreateProgramWithSource( clContext, 1, &kernelSource, NULL, &err ); CL_CHECK(err) err = clBuildProgram( clProgram, 1, &clDevice, sourceBuildOptions, NULL, NULL ); CL_CHECK(err) } err = clCreateKernelsInProgram( clProgram, 1, clKernel, NULL ); CL_CHECK(err) #if 0 // get kernel name size_t kernelNameLength; err = clGetKernelInfo( *clKernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernelNameLength), NULL, &kernelNameLength ); CL_CHECK(err) char *kernelName = new char[kernelNameLength]; err = clGetKernelInfo( *clKernel, CL_KERNEL_FUNCTION_NAME, kernelNameLength*sizeof(char), kernelName, NULL ); CL_CHECK(err) printf("makeGemmKernel: \"%s\" built; returning.\n", kernelName); delete[] kernelName; #endif } } /**************************************************************************** * Compare Matrices ***************************************************************************/ template bool compareMatrices( clblasOrder order, size_t rows, size_t columns, T *blasMatrix, T *naiveMatrix, size_t ld) { size_t r, c; MatrixAccessor blas(blasMatrix, order, clblasNoTrans, rows, columns, ld); MatrixAccessor naive(naiveMatrix, order, clblasNoTrans, rows, columns, ld); T blasVal, naiveVal; int numPrint = 96*96; bool equal = true; for (r = 0; r < rows; r++) { for (c = 0; c < columns; c++) { blasVal = blas[r][c]; naiveVal = naive[r][c]; if (isNAN(blasVal) && 
isNAN(naiveVal)) { continue; } if (blasVal != naiveVal) { equal = false; } if (blasVal != naiveVal) { if (numPrint-- > 0) { #if CGEMM || ZGEMM printf("MISMATCH C[%u][%u]: gpu= %4.1f + %4.1fi, cpu= %4.1f + %4.1fi\n", r, c, blasVal.s[0], blasVal.s[1], naiveVal.s[0], naiveVal.s[1] ); #else printf("MISMATCH C[%u][%u]: gpu= %4.1f, cpu= %4.1f\n", r, c, blasVal, naiveVal ); #endif } else { return equal; } } } } return equal; } const char PLATFORM_NAME[] = "AMD Accelerated Parallel Processing"; //const char DEVICE_NAME[] = "Hawaii"; #if SGEMM || CGEMM const float peakGflops = 5.24e3; // sp for W9100 #else const float peakGflops = 2.62e3; // dp for W9100 #endif //const float peakGflops = 696; // for R9 290 "Hawaii" const cl_uint offsetM = 0; const cl_uint offsetN = 0; const cl_uint offsetK = 0; cl_uint offA = 0; cl_uint offB = 0; cl_uint offC = 0; DATA_TYPE alpha = DATA_TYPE_CONSTRUCTOR(1, 0); cl_mem bufA = NULL; cl_mem bufB = NULL; cl_mem bufC = NULL; DATA_TYPE* A = NULL; DATA_TYPE* B = NULL; DATA_TYPE* C = NULL; DATA_TYPE* naiveC = NULL; const cl_uint workDim = 2; std::ofstream file; std::ofstream ksrFile; #if DO_VALIDATION const unsigned int numEnqueuesPerFlush = 1; const unsigned int numFlushesPerFinish = 1; const unsigned int numFinishes = 1; #else const unsigned int numEnqueuesPerFlush = 10; const unsigned int numFlushesPerFinish = 1; const unsigned int numFinishes = 1; #endif char* loadFile(const char* path); cl_platform_id getPlatform(const char *name); cl_device_id getDevice(cl_platform_id platform); cl_kernel createKernel(const char *source, cl_context context, const char* options, cl_int *error); cl_int err; cl_platform_id platform; cl_device_id device; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context context; cl_command_queue queue; /**************************************************************************** * Benchmark Kernel ***************************************************************************/ float benchmarkKernel( clblasOrder order, clblasTranspose transA, clblasTranspose transB, bool betaNonZero, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, size_t M, size_t N, size_t K ) { DATA_TYPE beta; if (betaNonZero) { beta = DATA_TYPE_CONSTRUCTOR(1, 0); } else { beta = DATA_TYPE_CONSTRUCTOR(0, 0); } bool needTileKernel = M/macroTileNumRows > 0 && N/macroTileNumCols > 0; bool needRowKernel = M%macroTileNumRows > 0 && N/macroTileNumCols > 0; bool needColKernel = N%macroTileNumCols > 0 && M/macroTileNumRows > 0; bool needCornerKernel = M%macroTileNumRows > 0 && N%macroTileNumCols > 0; #if 0 printf("Testing: %sgemm_%s_%s%s_%s_%03u_%03u_%02u\n", #if SGEMM "s", #elif DGEMM "d", #elif CGEMM "c", #else "z", #endif order==clblasColumnMajor ? "Col" : "Row", transA==clblasTrans ? "T" : "N", transB==clblasTrans ? "T" : "N", betaNonZero ? 
"_B1" : "_B0", macroTileNumRows, macroTileNumCols, unroll ); #endif //printf("M=%u, N=%u, K=%u\n", M, N, K); // matrix A parameters cl_uint numRowsA; cl_uint numColsA; if (transA == clblasTrans) { numRowsA = K; numColsA = M; } else { numRowsA = M; numColsA = K; } // matrix B parameters cl_uint numRowsB; cl_uint numColsB; if (transB == clblasTrans) { numRowsB = N; numColsB = K; } else { numRowsB = K; numColsB = N; } // Matrix C cl_uint numRowsC = M; cl_uint numColsC = N; // leading dimension cl_uint lda; cl_uint ldb; cl_uint ldc; if (order == clblasColumnMajor) { lda = numRowsA; ldb = numRowsB; ldc = numRowsC; } else { lda = numColsA; ldb = numColsB; ldc = numColsC; } const char *tileKernelSource; const char *rowKernelSource; const char *colKernelSource; const char *cornerKernelSource; const char *sourceBuildOptions; const unsigned char *tileKernelBinary; const unsigned char *rowKernelBinary; const unsigned char *colKernelBinary; const unsigned char *cornerKernelBinary; size_t *tileKernelBinarySize = 0; size_t *rowKernelBinarySize = 0; size_t *colKernelBinarySize = 0; size_t *cornerKernelBinarySize = 0; const char *binaryBuildOptions; cl_kernel *tileClKernel; cl_kernel *rowClKernel; cl_kernel *colClKernel; cl_kernel *cornerClKernel; unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int microTileNumRows; unsigned int microTileNumCols; //printf("Creating kernel.\n"); bool kernelFound = gemmSelectKernelSpecific( order, transA, transB, betaNonZero, macroTileNumRows, macroTileNumCols, unroll, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, &tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols ); if ( !kernelFound ) { printf("ERROR: couldn't find kernel\n" ); } if (needTileKernel) makeGemmKernel( tileClKernel, queue, tileKernelSource, sourceBuildOptions, &tileKernelBinary, binaryBuildOptions); if (needRowKernel) makeGemmKernel( rowClKernel, queue, rowKernelSource, sourceBuildOptions, &rowKernelBinary, binaryBuildOptions); if (needColKernel) makeGemmKernel( colClKernel, queue, colKernelSource, sourceBuildOptions, &colKernelBinary, binaryBuildOptions); if (needCornerKernel) makeGemmKernel(cornerClKernel, queue, cornerKernelSource, sourceBuildOptions, &cornerKernelBinary, binaryBuildOptions); /**************************************************************************** * Tile Kernel ***************************************************************************/ //printf("%s", tileKernelSource); unsigned int totalEnqueues = 0; if (needTileKernel) { err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, 
sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); totalEnqueues++; } // kernel dimensions const size_t localWorkSize[2] = { workGroupNumRows, workGroupNumCols }; size_t tileKernelGlobalWorkSize[2] = { (M/(macroTileNumRows))*workGroupNumRows, (N/(macroTileNumCols))*workGroupNumCols }; size_t rowKernelGlobalWorkSize[2] = { 1*workGroupNumRows, (N/(macroTileNumCols))*workGroupNumCols }; size_t colKernelGlobalWorkSize[2] = { (M/(macroTileNumRows))*workGroupNumRows, 1*workGroupNumCols }; size_t cornerKernelGlobalWorkSize[2] = { 1*workGroupNumRows, 1*workGroupNumCols }; /**************************************************************************** * Row Kernel (along bottom of matrix) ***************************************************************************/ if (needRowKernel) { err = clSetKernelArg(*rowClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); totalEnqueues++; // kernel dimensions } /**************************************************************************** * Col Kernel (along side of kernel) ***************************************************************************/ if (needColKernel) { err = clSetKernelArg(*colClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); totalEnqueues++; // kernel dimensions 
} /**************************************************************************** * Corner Kernel (lower left corder of kernel) ***************************************************************************/ if (needCornerKernel) { err = clSetKernelArg(*cornerClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); totalEnqueues++; // kernel dimensions } totalEnqueues *= numEnqueuesPerFlush * numFlushesPerFinish * numFinishes; cl_event kernelEvents[numEnqueuesPerFlush * numFlushesPerFinish * numFinishes * 4]; unsigned int kernelIdx = 0; //printf("Launching %u kernels of %u x %u threads\n", totalEnqueues, globalWorkSize[0], globalWorkSize[1]); for (unsigned int finishIdx = 0; finishIdx < numFinishes; finishIdx++) { for (unsigned int flushIdx = 0; flushIdx < numFlushesPerFinish; flushIdx++) { for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerFlush; enqIdx++) { // tile kernel if (needTileKernel) { err = clEnqueueNDRangeKernel(queue, *tileClKernel, workDim, NULL, tileKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } // row kernel if (needRowKernel) { //printf("launching rowKernel %ux%u threads b/c M=%u\n", rowKernelGlobalWorkSize[0], rowKernelGlobalWorkSize[1], M); err = clEnqueueNDRangeKernel(queue, *rowClKernel, workDim, NULL, rowKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } // col kernel if (needColKernel) { //printf("launching colKernel %ux%u threads b/c N=%u\n", colKernelGlobalWorkSize[0], colKernelGlobalWorkSize[1], N); err = clEnqueueNDRangeKernel(queue, *colClKernel, workDim, NULL, colKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } // corner kernel if (needCornerKernel) { //printf("launching crnKernel %ux%u threads b/c M=%u, N=%u\n", cornerKernelGlobalWorkSize[0], cornerKernelGlobalWorkSize[1], M, N); err = clEnqueueNDRangeKernel(queue, *cornerClKernel, workDim, NULL, cornerKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } } err = clFlush(queue); CL_CHECK(err); } err = clFinish(queue); CL_CHECK(err); } cl_ulong totalNs = 0; cl_ulong totalFlops = (size_t) numEnqueuesPerFlush * numFlushesPerFinish * numFinishes * (2 * M * N * K); #if CGEMM || ZGEMM // complex totalFlops *= 4; #endif cl_ulong start, end; for (kernelIdx = 0; kernelIdx < totalEnqueues; kernelIdx++) { err = clGetEventProfilingInfo(kernelEvents[kernelIdx], CL_PROFILING_COMMAND_START, 
sizeof(start), &start, NULL); CL_CHECK(err); err = clGetEventProfilingInfo(kernelEvents[kernelIdx], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); CL_CHECK(err); cl_ulong timeNs = end - start; totalNs += timeNs; } double gFlops = (1.0*totalFlops) / (1.0*totalNs); return gFlops; } /**************************************************************************** * Main ***************************************************************************/ int main(void) { file.open(rawFileName, std::ios_base::out); // or ::app for append file << "M, N, "; bool printDetails = true; // load tiles for precision tiles = new unsigned int*[numTiles]; for (unsigned int i = 0; i < numTiles; i++) { tiles[i] = #if SGEMM sgemmTileEnumeration[i]; #elif DGEMM dgemmTileEnumeration[i]; #elif CGEMM cgemmTileEnumeration[i]; #elif ZGEMM zgemmTileEnumeration[i]; #endif } for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { unsigned int *tile = tiles[tileIdx]; file << tile[0] << "x" << tile[1] << ", "; } file << "<-F|T->, "; for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { unsigned int *tile = tiles[tileIdx]; file << tile[0] << "x" << tile[1] << ", "; } file << "fallback, fastest, would-be valid tiles\n"; int *fallbackBegin = new int[numTiles]; // size at which tile starts being fallback int *fallbackEnd = new int[numTiles]; // size at which tile stops being fallback int *validBegin = new int[numTiles]; // size at which tile starts being valid int *validEnd = new int[numTiles]; // size at which tile stops being valid float *fallbackScore = new float[numTiles]; // fallback score for a size float *tileScore = new float[numTiles]; // tile score for a size unsigned int *validTiles = new unsigned int[numTiles]; for (unsigned int i = 0; i < numTiles; i++) { fallbackBegin[i] = -1; fallbackEnd[i] = -1; validBegin[i] = -1; validEnd[i] = -1; } platform = getPlatform(PLATFORM_NAME); assert(platform != NULL); device = getDevice(platform); assert(device != NULL); props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); assert(context != NULL); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); assert(queue != NULL); clblasOrder order = clblasColumnMajor; clblasTranspose transA = clblasNoTrans; clblasTranspose transB = clblasTrans; bool beta = false; unsigned int systemSizeMin = 16; unsigned int systemSizeStep = 16; //unsigned int kValues[] = {64, 512, 2048}; //unsigned int numKValues = 3; unsigned int kValues[] = {0}; unsigned int numKValues = 1; //unsigned int kValues[] = {4032}; //unsigned int numKValues = 1; unsigned int kMax; if (kValues[numKValues-1] > 0) { kMax = kValues[numKValues-1]; } else { kMax = systemSizeMax; } /****************************************************************** * Largest Matrix Dimension *****************************************************************/ cl_uint numRowsA; cl_uint numColsA; if (transA == clblasTrans) { numRowsA = kMax; numColsA = systemSizeMax; } else { numRowsA = systemSizeMax; numColsA = kMax; } // matrix B parameters cl_uint numRowsB; cl_uint numColsB; if (transB == clblasTrans) { numRowsB = systemSizeMax; numColsB = kMax; } else { numRowsB = systemSizeMax; numColsB = kMax; } // Matrix C cl_uint numRowsC = systemSizeMax; cl_uint numColsC = systemSizeMax; // leading dimension cl_uint lda; cl_uint ldb; cl_uint ldc; if (order == clblasColumnMajor) { lda = numRowsA; ldb = numRowsB; ldc = numRowsC; } else { lda = numColsA; ldb = numColsB; ldc = numColsC; } 
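/* Illustration added for clarity (not in the original benchmark): the branch above
 * picks the leading dimension as the number of stored rows for column-major data
 * and the number of stored columns for row-major data. With the defaults in this
 * file (SGEMM, kValues = {0}, transA = clblasNoTrans), kMax == systemSizeMax ==
 * 1000, so A is a 1000 x 1000 column-major block with lda == 1000. The asserts
 * below only restate what the branch just computed. */
assert(order != clblasColumnMajor || (lda == numRowsA && ldb == numRowsB && ldc == numRowsC));
assert(order != clblasRowMajor    || (lda == numColsA && ldb == numColsB && ldc == numColsC));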
/****************************************************************** * Allocate Matrices *****************************************************************/ A = (DATA_TYPE*)malloc((offA + numRowsA * numColsA) * sizeof(*A)); assert(A != NULL); randomMatrix(order, numRowsA, numColsA, A + offA, lda); B = (DATA_TYPE*)malloc((offB + numRowsB * numColsB) * sizeof(*B)); assert(B != NULL); randomMatrix(order, numRowsB, numColsB, B + offB, ldb); C = (DATA_TYPE*)malloc((offC + numRowsC * numColsC) * sizeof(*C)); assert(C != NULL); randomMatrix(order, numRowsC, numColsC, C + offC, ldc); bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, (offA + numRowsA * numColsA) * sizeof(*A), NULL, &err); CL_CHECK(err); assert(bufA != NULL); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, (offA + numRowsA * numColsA) * sizeof(*A), A, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, (offB + numRowsB * numColsB) * sizeof(*B), NULL, &err); CL_CHECK(err); assert(bufB != NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, (offB + numRowsB * numColsB) * sizeof(*B), B, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); //printf("Writing to gpu buffers.\n"); bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, (offC + numRowsC * numColsC) * sizeof(*C), NULL, &err); CL_CHECK(err); assert(bufC != NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, (offC + numRowsC * numColsC) * sizeof(*C), C, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); // (0) for each precision // (1) for each system size ksrFile.open( ksrFileName, std::ios_base::out); // or ::app for append KernelSelectionRules ksr(ksrFile); for (unsigned int systemSize = systemSizeMin; systemSize <= systemSizeMax; systemSize += systemSizeStep) { unsigned int M = systemSize; unsigned int N = systemSize; file << M << ", " << N << ", "; // reset scores for this system size for (unsigned int i = 0; i < numTiles; i++) { fallbackScore[i] = 0.f; tileScore[i] = 0.f; } // (2) for each k size for (unsigned int kIdx = 0; kIdx < numKValues; kIdx++) { unsigned int K = kValues[kIdx]; if (K == 0) K = systemSize; // (3) for each tile for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { unsigned int *tile = tiles[tileIdx]; unsigned int macroTileNumRows = tile[0]; unsigned int macroTileNumCols = tile[1]; unsigned int unroll = tile[2]; //if (printDetails) printf("%4ux%4ux%4u; %ux%u; ", M, N, K, macroTileNumRows, macroTileNumCols ); /****************************************************************** * (4) fallback speed *****************************************************************/ float fallbackSpeed = benchmarkKernel( // non-tile order, transA, transB, false, // tile macroTileNumRows, macroTileNumCols, unroll, // system M-1, N-1, K ); fallbackScore[tileIdx] += fallbackSpeed; /****************************************************************** * (5) tile speed *****************************************************************/ float tileSpeed = 0.f; if (M%macroTileNumRows==0 && N%macroTileNumCols==0) { tileSpeed = benchmarkKernel( // non-tile order, transA, transB, false, // tile macroTileNumRows, macroTileNumCols, unroll, // system M, N, K ); tileScore[tileIdx] += tileSpeed; } //if (printDetails) printf("fs=%8.3f, ts=%8.3f\n", fallbackSpeed, tileSpeed ); } // tile sizes } // for k /************************************************************** * (6) score is gbps averaged over k values *************************************************************/ for (unsigned int tileIdx = 0; tileIdx < 
numTiles; tileIdx++) { fallbackScore[tileIdx] /= numKValues; file << fallbackScore[tileIdx] << ", "; } file << "<-F|T->, "; for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { tileScore[tileIdx] /= numKValues; file << tileScore[tileIdx] << ", "; } /************************************************************** * (7) get fastest fallback speed for this system size *************************************************************/ float fastestFallbackScore = 0; unsigned int fastestFallbackIdx = 0; for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { if (fallbackScore[tileIdx] > fastestFallbackScore) { fastestFallbackScore = fallbackScore[tileIdx]; fastestFallbackIdx = tileIdx; } } file << tiles[fastestFallbackIdx][0] << "x" << tiles[fastestFallbackIdx][1] << ", "; /************************************************************** * (8) ensure fallback tile has begun/ended *************************************************************/ //if (fallbackBegin[fastestFallbackIdx] == -1) { // fallbackBegin[fastestFallbackIdx] = static_cast(systemSize); //} //fallbackEnd[fastestFallbackIdx] = static_cast(systemSize); // push the end back farther /************************************************************** * (9) which tiles are valid for this system size * - tile must be faster than fallback * - there must not exist a faster tile which covers the same multiples *************************************************************/ unsigned int numValidTiles = 0; float priorFastestTileScore = 99999999; for (unsigned int checkIter = 0; checkIter < numTiles; checkIter++) { // find the next fastest tile float fastestTileScore = -1.f; int fastestTileIdx = -1; for (unsigned int tileIdx = 0; tileIdx < numTiles; tileIdx++) { if (tileScore[tileIdx] > fastestTileScore && (tileScore[tileIdx] < priorFastestTileScore || priorFastestTileScore < 0) ) { fastestTileScore = tileScore[tileIdx]; fastestTileIdx = tileIdx; } } priorFastestTileScore = fastestTileScore; // if next fastest tile isn't faster than fallback, then quit if (fastestTileScore < fastestFallbackScore-1) break; // if the coverage of this tile is already handled by prior (faster) valid tiles, then skip it bool uniqueCoverage = true; for (unsigned int i = 0; i < numValidTiles; i++) { if ( tiles[fastestTileIdx][0] % tiles[ validTiles[i] ][0] == 0 && tiles[fastestTileIdx][1] % tiles[ validTiles[i] ][1] == 0 ) { uniqueCoverage = false; break; } } if (!uniqueCoverage) continue; // this tile valid validTiles[numValidTiles] = fastestTileIdx; numValidTiles++; } for (unsigned int i = 0; i < numValidTiles; i++) { file << tiles[validTiles[i]][0] << "x" << tiles[validTiles[i]][1] << ", "; } ksr.add(M, N, validTiles, numValidTiles, fastestFallbackIdx ); // for now, just pay attention to the fastest tile //if (numValidTiles > 1) { // numValidTiles = 1; //} /************************************************************** * (10) ensure valid tiles have begun/ended *************************************************************/ //for (unsigned int i = 0; i < numValidTiles; i++) { // if (validBegin[ validTiles[i] ] == -1) { // validBegin[ validTiles[i] ] = static_cast(systemSize); // } // validEnd[ validTiles[i] ] = static_cast(systemSize); // push the end back farther //} // print valid tiles //printf("%4ux%4u; fallback = %ux%u; validTiles = ", M, N, tiles[fastestFallbackIdx][2], tiles[fastestFallbackIdx][3]); //for (unsigned int i = 0; i < numValidTiles; i++) { // printf("%ux%u, ", tiles[ validTiles[i] ][2], tiles[ validTiles[i] ][3]); //} //printf("\n"); // 
print tile ranges //for (unsigned int i = 0; i < numTiles; i++) { // printf("%4u; %ux%u fallback=[%4i, %4i] tile=[%4i, %4i]\n", // systemSize, tiles[i][2], tiles[i][3], // fallbackBegin[i], fallbackEnd[i], // validBegin[i], validEnd[i] ); //} //printf("\n"); file << "\n"; } // for system size file.close(); ksrFile.close(); //err = clReleaseMemObject(bufA); CL_CHECK(err); //err = clReleaseMemObject(bufB); CL_CHECK(err); //err = clReleaseMemObject(bufC); CL_CHECK(err); //err = clReleaseKernel(kernel); CL_CHECK(err); //err = clReleaseCommandQueue(queue); CL_CHECK(err); //err = clReleaseContext(context); CL_CHECK(err); //free(A); //free(B); //free(C); //free(naiveC); //free(source); //system("PAUSE"); //Sleep(5000); // ms exit(EXIT_SUCCESS); }; cl_platform_id getPlatform(const char *name) { cl_int err; cl_uint nrPlatforms, i; cl_platform_id *list, platform; char platformName[64]; err = clGetPlatformIDs(0, NULL, &nrPlatforms); CL_CHECK(err); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { return NULL; } list = (cl_platform_id*)malloc(nrPlatforms * sizeof(*list)); if (list == NULL) { return NULL; } err = clGetPlatformIDs(nrPlatforms, list, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { free(list); return NULL; } platform = NULL; for (i = 0; i < nrPlatforms; i++) { err = clGetPlatformInfo(list[i], CL_PLATFORM_NAME, sizeof(platformName), platformName, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); if ((err == CL_SUCCESS) && (strcmp(platformName, name) == 0)) { platform = list[i]; break; } } free(list); return platform; } cl_device_id getDevice( cl_platform_id platform) { cl_int err; cl_uint nrDevices, i; cl_device_id *list, device; char deviceName[64]; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices); CL_CHECK(err); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { return NULL; } assert( nrDevices > 0 ); list = (cl_device_id*)malloc(nrDevices * sizeof(*list)); assert(list); if (list == NULL) { printf("Error: malloc device list\n"); return NULL; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, nrDevices, list, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { free(list); return NULL; } device = NULL; for (i = 0; i < nrDevices; i++) { err = clGetDeviceInfo(list[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); if ((err == CL_SUCCESS) ) { device = list[i]; break; } } free(list); return device; } cl_kernel createKernel( const char* source, cl_context context, const char* options, cl_int* error) { //printf("Kernel Source:\n%s", source ); cl_int err; cl_device_id device; cl_program program; cl_kernel kernel; size_t logSize; char *log; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return NULL; } program = clCreateProgramWithSource(context, 1, &source, NULL, &err); assert(err == CL_SUCCESS); assert(program != NULL); if (program == NULL) { if (error != NULL) { *error = err; } return NULL; } err = clBuildProgram(program, 1, &device, options, NULL, NULL); if (err != CL_SUCCESS) { logSize = 0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); log = (char*)malloc(logSize + 1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); printf("=== Build Log [%lu]===\n%s\n", logSize, log); free(log); } assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { clReleaseProgram(program); if (error != NULL) { 
*error = err; } return NULL; } kernel = NULL; err = clCreateKernelsInProgram(program, 1, &kernel, NULL); assert(err == CL_SUCCESS); assert(kernel != NULL); clReleaseProgram(program); // kernel name size_t length; char kernelName[64]; err = clGetKernelInfo( kernel, CL_KERNEL_FUNCTION_NAME, 64, kernelName, &length ); //printf("KernelName[%lu]: %s\n", length, kernelName); // kernel arguments cl_uint numArguments; err = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof(numArguments), &numArguments, NULL ); if (error != NULL) { *error = err; } return kernel; } clblas-2.10/src/library/blas/AutoGemm/AutoGemmTools/TestAutoGemm.cpp000066400000000000000000000731731264277366700254460ustar00rootroot00000000000000#include #include #include #include #include #include #include //#include #include //#include "library/tools/ktest/naive/naive_blas.cpp" //using namespace NaiveBlas; #include "AutoGemmIncludes/AutoGemmKernelSelection.h" #include "AutoGemmIncludes/AutoGemmKernelSelectionSpecific.h" #include "AutoGemmIncludes/AutoGemmKernelEnumeration.h" #include "AutoGemmUtil.h" #if 0 // from clBLAS.h typedef enum clblasOrder_ { clblasRowMajor, clblasColumnMajor } clblasOrder; typedef enum clblasTranspose_ { clblasNoTrans, clblasTrans, clblasConjTrans } clblasTranspose; #endif #define SGEMM 0 #define DGEMM 1 #define CGEMM 0 #define ZGEMM 0 #define RANDOM_DATA 1 #define DO_VALIDATION 1 #if SGEMM #define DATA_TYPE float #define DATA_TYPE_CONSTRUCTOR(X,Y) X const unsigned int numTiles = sgemmNumTiles; const unsigned int numNonTiles = sgemmNumNonTiles; const unsigned int numKernels = sgemmNumKernels; #endif #if DGEMM #define DATA_TYPE double #define DATA_TYPE_CONSTRUCTOR(X,Y) X const unsigned int numTiles = dgemmNumTiles; const unsigned int numNonTiles = dgemmNumNonTiles; const unsigned int numKernels = dgemmNumKernels; #endif #if CGEMM #define DATA_TYPE FloatComplex #define DATA_TYPE_CONSTRUCTOR floatComplex const unsigned int numTiles = cgemmNumTiles; const unsigned int numNonTiles = cgemmNumNonTiles; const unsigned int numKernels = cgemmNumKernels; #endif #if ZGEMM #define DATA_TYPE DoubleComplex #define DATA_TYPE_CONSTRUCTOR doubleComplex const unsigned int numTiles = zgemmNumTiles; const unsigned int numNonTiles = zgemmNumNonTiles; const unsigned int numKernels = zgemmNumKernels; #endif #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif #define CL_CHECK(RET) \ if(RET != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", RET, __LINE__); \ assert(false); \ } template void randomMatrix( clblasOrder order, size_t rows, size_t columns, T *A, size_t lda) { size_t r, c; MatrixAccessor a(A, order, clblasNoTrans, rows, columns, lda); for (r = 0; r < rows; r++) { for (c = 0; c < columns; c++) { #if RANDOM_DATA a[r][c] = random(); #else a[r][c] = DATA_TYPE_CONSTRUCTOR(1, 0); #endif } } } template bool compareMatrices( clblasOrder order, size_t rows, size_t columns, T *blasMatrix, T *naiveMatrix, size_t ld) { size_t r, c; MatrixAccessor blas(blasMatrix, order, clblasNoTrans, rows, columns, ld); MatrixAccessor naive(naiveMatrix, order, clblasNoTrans, rows, columns, ld); T blasVal, naiveVal; int numPrint = 96*96; bool equal = true; for (r = 0; r < rows; r++) { for (c = 0; c < columns; c++) { blasVal = blas[r][c]; naiveVal = naive[r][c]; if (isNAN(blasVal) && isNAN(naiveVal)) { continue; } if (blasVal != naiveVal) { equal = false; } if (blasVal != naiveVal) { if (numPrint-- > 0) { #if CGEMM || ZGEMM printf("MISMATCH C[%u][%u]: gpu= %4.1f + %4.1fi, cpu= %4.1f + %4.1fi\n", r, c, blasVal.s[0], 
blasVal.s[1], naiveVal.s[0], naiveVal.s[1] ); #else printf("MISMATCH C[%u][%u]: gpu= %4.1f, cpu= %4.1f\n", r, c, blasVal, naiveVal ); #endif } else { return equal; } } } } return equal; } const char PLATFORM_NAME[] = "AMD Accelerated Parallel Processing"; const char DEVICE_NAME[] = "Hawaii"; #if SGEMM || CGEMM const float peakGflops = 5.24e3; // sp for W9100 #else const float peakGflops = 2.62e3; // dp for W9100 #endif //const float peakGflops = 696; // for R9 290 "Hawaii" const cl_uint offsetM = 0; const cl_uint offsetN = 0; const cl_uint offsetK = 0; cl_uint offA = 0; cl_uint offB = 0; cl_uint offC = 0; DATA_TYPE alpha = DATA_TYPE_CONSTRUCTOR(1, 0); cl_mem bufA = NULL; cl_mem bufB = NULL; cl_mem bufC = NULL; DATA_TYPE* A = NULL; DATA_TYPE* B = NULL; DATA_TYPE* C = NULL; DATA_TYPE* naiveC = NULL; const cl_uint workDim = 2; #if DO_VALIDATION const unsigned int numEnqueuesPerFlush = 1; const unsigned int numFlushesPerFinish = 1; const unsigned int numFinishes = 1; #else const unsigned int numEnqueuesPerFlush = 2; const unsigned int numFlushesPerFinish = 2; const unsigned int numFinishes = 2; #endif cl_platform_id getPlatform(const char *name); cl_device_id getDevice(cl_platform_id platform, const char *name); cl_kernel createKernel(const char *source, cl_context context, const char* options, cl_int *error); void testKernelParameterCombination( unsigned int columnMajorInt, unsigned int transAInt, unsigned int transBInt, unsigned int betaNonZero, unsigned int macroTileNumRows, unsigned int macroTileNumCols, unsigned int unroll, unsigned int mSpill, unsigned int nSpill ) { DATA_TYPE beta; if (betaNonZero) { beta = DATA_TYPE_CONSTRUCTOR(1, 0); } else { beta = DATA_TYPE_CONSTRUCTOR(0, 0); } // how large of a matrix to test? #if DO_VALIDATION size_t M = 16*macroTileNumRows; size_t N = 16*macroTileNumCols; size_t K = 16*unroll; #else //if (mSpill || nSpill || unroll==1 || transAInt==1 || transBInt==0) return; if (mSpill || nSpill || unroll==1 ) return; size_t M = 22*macroTileNumRows; size_t N = 24*macroTileNumCols; size_t K = 2*64*90+unroll; #endif if (mSpill) { M += 1; } if (nSpill) { N += 1; } #if 1 printf("Testing: %sgemm_%s%s_B%u_MX%03u_NX%03u_KX%02u\n", #if SGEMM "s", #elif DGEMM "d", #elif CGEMM "c", #else "z", #endif transAInt ? "T" : "N", transBInt ? "T" : "N", betaNonZero ? 
1 : 0, macroTileNumRows, macroTileNumCols, unroll ); #endif //printf("M=%u, N=%u, K=%u\n", M, N, K); // matrix A parameters clblasTranspose transA; cl_uint numRowsA; cl_uint numColsA; if (transAInt) { transA = clblasTrans; numRowsA = K; numColsA = M; } else { transA = clblasNoTrans; numRowsA = M; numColsA = K; } // matrix B parameters clblasTranspose transB; cl_uint numRowsB; cl_uint numColsB; if (transBInt) { transB = clblasTrans; numRowsB = N; numColsB = K; } else { transB = clblasNoTrans; numRowsB = K; numColsB = N; } // Matrix C cl_uint numRowsC = M; cl_uint numColsC = N; // leading dimension clblasOrder order; cl_uint lda; cl_uint ldb; cl_uint ldc; if (columnMajorInt) { order = clblasColumnMajor; lda = numRowsA; ldb = numRowsB; ldc = numRowsC; } else { order = clblasRowMajor; lda = numColsA; ldb = numColsB; ldc = numColsC; } cl_int err; cl_platform_id platform; cl_device_id device; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context context; cl_command_queue queue; platform = getPlatform(PLATFORM_NAME); assert(platform != NULL); device = getDevice(platform, DEVICE_NAME); assert(device != NULL); props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); assert(context != NULL); queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); assert(queue != NULL); //printf("Allocating matrices.\n"); A = (DATA_TYPE*)malloc((offA + numRowsA * numColsA) * sizeof(*A)); assert(A != NULL); randomMatrix(order, numRowsA, numColsA, A + offA, lda); B = (DATA_TYPE*)malloc((offB + numRowsB * numColsB) * sizeof(*B)); assert(B != NULL); randomMatrix(order, numRowsB, numColsB, B + offB, ldb); C = (DATA_TYPE*)malloc((offC + numRowsC * numColsC) * sizeof(*C)); assert(C != NULL); randomMatrix(order, numRowsC, numColsC, C + offC, ldc); naiveC = (DATA_TYPE*)malloc((offC + numRowsC * numColsC) * sizeof(*naiveC)); assert(naiveC != NULL); memcpy(naiveC, C, (offC + numRowsC * numColsC) * sizeof(*C)); #if DO_VALIDATION //printf("Running naive gemm.\n"); gemm(order, transA, transB, M, N, K, alpha, A + offA, lda, B + offB, ldb, beta, naiveC + offC, ldc); #endif bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, (offA + numRowsA * numColsA) * sizeof(*A), NULL, &err); CL_CHECK(err); assert(bufA != NULL); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, (offA + numRowsA * numColsA) * sizeof(*A), A, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, (offB + numRowsB * numColsB) * sizeof(*B), NULL, &err); CL_CHECK(err); assert(bufB != NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, (offB + numRowsB * numColsB) * sizeof(*B), B, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); //printf("Writing to gpu buffers.\n"); bufC = clCreateBuffer(context, CL_MEM_READ_WRITE, (offC + numRowsC * numColsC) * sizeof(*C), NULL, &err); CL_CHECK(err); assert(bufC != NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, (offC + numRowsC * numColsC) * sizeof(*C), C, 0, NULL, NULL); CL_CHECK(err); assert(err == CL_SUCCESS); float optimalNumElementsPerWorkItem = 1; const char *tileKernelSource; const char *rowKernelSource; const char *colKernelSource; const char *cornerKernelSource; const char *sourceBuildOptions; const unsigned char *tileKernelBinary; const unsigned char *rowKernelBinary; const unsigned char *colKernelBinary; const unsigned char *cornerKernelBinary; size_t *tileKernelBinarySize = 0; size_t *rowKernelBinarySize = 0; size_t *colKernelBinarySize = 0; size_t 
*cornerKernelBinarySize = 0; const char *binaryBuildOptions; cl_kernel *tileClKernel; cl_kernel *rowClKernel; cl_kernel *colClKernel; cl_kernel *cornerClKernel; unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int microTileNumRows; unsigned int microTileNumCols; unsigned int retUnroll; #if 0 //printf("Creating kernel.\n"); gemmSelectKernel( order, transA, transB, M, N, K, betaNonZero==1, optimalNumElementsPerWorkItem, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, &tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols, &retUnroll ); bool kernelFound = tileKernelSource != NULL; #else bool kernelFound = gemmSelectKernelSpecific( order, transA, transB, betaNonZero==1, macroTileNumRows, macroTileNumCols, unroll, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, &tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols ); #endif if ( !kernelFound ) { printf("ERROR: selected kernel doesn't match desired kernel: %u, %u, %u, %u, %u\n", workGroupNumRows, workGroupNumCols, microTileNumRows, microTileNumCols, unroll ); } /**************************************************************************** * Tile Kernel ***************************************************************************/ //printf("%s", tileKernelSource); assert(tileKernelSource != NULL); *tileClKernel = createKernel(tileKernelSource, context, sourceBuildOptions, &err); assert(tileClKernel != NULL); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); // kernel dimensions const size_t localWorkSize[2] = { workGroupNumRows, workGroupNumCols }; size_t tileKernelGlobalWorkSize[2] = { (M/(macroTileNumRows))*workGroupNumRows, (N/(macroTileNumCols))*workGroupNumCols }; size_t rowKernelGlobalWorkSize[2] = { 1*workGroupNumRows, (N/(macroTileNumCols))*workGroupNumCols }; size_t colKernelGlobalWorkSize[2] = { (M/(macroTileNumRows))*workGroupNumRows, 1*workGroupNumCols }; size_t 
cornerKernelGlobalWorkSize[2] = { 1*workGroupNumRows, 1*workGroupNumCols }; /**************************************************************************** * Row Kernel (along bottom of matrix) ***************************************************************************/ if (mSpill) { assert(rowKernelSource != NULL); *rowClKernel = createKernel(rowKernelSource, context, sourceBuildOptions, &err); assert(rowClKernel != NULL); err = clSetKernelArg(*rowClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*rowClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); // kernel dimensions } /**************************************************************************** * Col Kernel (along side of kernel) ***************************************************************************/ if (nSpill) { assert(colKernelSource != NULL); *colClKernel = createKernel(colKernelSource, context, sourceBuildOptions, &err); assert(colClKernel != NULL); err = clSetKernelArg(*colClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*colClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); // kernel dimensions } /**************************************************************************** * Corner Kernel (lower left corder of kernel) ***************************************************************************/ if (mSpill && nSpill) { assert(cornerKernelSource != NULL); *cornerClKernel = createKernel(cornerKernelSource, context, sourceBuildOptions, &err); assert(cornerClKernel != NULL); err = clSetKernelArg(*cornerClKernel, 0, sizeof(cl_mem), &bufA); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 1, sizeof(cl_mem), &bufB); CL_CHECK(err); err = 
clSetKernelArg(*cornerClKernel, 2, sizeof(cl_mem), &bufC); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 3, sizeof(DATA_TYPE), &alpha); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 4, sizeof(DATA_TYPE), &beta); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*cornerClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); // kernel dimensions } unsigned int totalEnqueues = numEnqueuesPerFlush * numFlushesPerFinish * numFinishes; if (mSpill || nSpill) { totalEnqueues *= 2; } if (mSpill && nSpill) { totalEnqueues *= 2; } cl_event kernelEvents[numEnqueuesPerFlush * numFlushesPerFinish * numFinishes * 4]; unsigned int kernelIdx = 0; //printf("Launching %u kernels of %u x %u threads\n", totalEnqueues, globalWorkSize[0], globalWorkSize[1]); for (unsigned int finishIdx = 0; finishIdx < numFinishes; finishIdx++) { for (unsigned int flushIdx = 0; flushIdx < numFlushesPerFinish; flushIdx++) { for (unsigned int enqIdx = 0; enqIdx < numEnqueuesPerFlush; enqIdx++) { // tile kernel err = clEnqueueNDRangeKernel(queue, *tileClKernel, workDim, NULL, tileKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; #if 1 // row kernel if (mSpill) { printf("launching rowKernel %ux%u threads b/c M=%u\n", rowKernelGlobalWorkSize[0], rowKernelGlobalWorkSize[1], M); err = clEnqueueNDRangeKernel(queue, *rowClKernel, workDim, NULL, rowKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } // col kernel if (nSpill) { printf("launching colKernel %ux%u threads b/c N=%u\n", colKernelGlobalWorkSize[0], colKernelGlobalWorkSize[1], N); err = clEnqueueNDRangeKernel(queue, *colClKernel, workDim, NULL, colKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } // corner kernel if (mSpill && nSpill) { printf("launching crnKernel %ux%u threads b/c M=%u, N=%u\n", cornerKernelGlobalWorkSize[0], cornerKernelGlobalWorkSize[1], M, N); err = clEnqueueNDRangeKernel(queue, *cornerClKernel, workDim, NULL, cornerKernelGlobalWorkSize, localWorkSize, 0, NULL, &kernelEvents[kernelIdx]); CL_CHECK(err); kernelIdx++; } #endif } err = clFlush(queue); CL_CHECK(err); } err = clFinish(queue); CL_CHECK(err); } #if DO_VALIDATION #else cl_ulong start, end; for (kernelIdx = 0; kernelIdx < totalEnqueues; kernelIdx++) { err = clGetEventProfilingInfo(kernelEvents[kernelIdx], CL_PROFILING_COMMAND_START, sizeof(start), &start, NULL); CL_CHECK(err); err = clGetEventProfilingInfo(kernelEvents[kernelIdx], CL_PROFILING_COMMAND_END, sizeof(end), &end, NULL); CL_CHECK(err); cl_ulong timeNs = end - start; cl_ulong totalFlops; if (!mSpill && !nSpill) { totalFlops = 2*((cl_ulong)M)*N*K; } else if (mSpill && !nSpill) { if (kernelIdx%2==0) { totalFlops = 2*((cl_ulong)M)*N*K; } else { totalFlops = 2*((cl_ulong)macroTileNumRows)*N*K; } } else if (nSpill && !mSpill) { if (kernelIdx%2==0) { totalFlops = 
2*((cl_ulong)M)*N*K; } else { totalFlops = 2*((cl_ulong)M)*macroTileNumCols*K; } } else { if (kernelIdx%4==0) { totalFlops = 2*((cl_ulong)M)*N*K; } else if (kernelIdx%4==1) { totalFlops = 2*((cl_ulong)macroTileNumRows)*N*K; } else if (kernelIdx%4==2) { totalFlops = 2*((cl_ulong)M)*macroTileNumCols*K; } else { totalFlops = 2*((cl_ulong)macroTileNumRows)*macroTileNumCols*K; } } #if CGEMM || ZGEMM // complex totalFlops *= 4; #endif double gFlops = (1.0*totalFlops) / (1.0*timeNs); printf("%12llu flops in %12llu ns = %7.1f Gflop/s (%5.1f%% of peak)\n", totalFlops, timeNs, gFlops, 100*gFlops/peakGflops); } #endif err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, (offC + numRowsC * numColsC) * sizeof(*C), C, 0, NULL, NULL); CL_CHECK(err); #if DO_VALIDATION bool equal = compareMatrices(order, numRowsC, numColsC, C + offC, naiveC + offC, ldc); printf("%s_%s%s_%03u_%03u_%u_%02ux%02u_%ux%u%s%s%s%s", #if SGEMM "sgemm", #endif #if DGEMM "dgemm", #endif #if CGEMM "cgemm", #endif #if ZGEMM "zgemm", #endif transAInt ? "T" : "N", transBInt ? "T" : "N", macroTileNumRows, macroTileNumCols, unroll, workGroupNumRows, workGroupNumCols, microTileNumRows, microTileNumCols, columnMajorInt ? "_ColumnMajor" : "_RowMajor", mSpill ? "_1" : "_0", nSpill ? "_1" : "_0", betaNonZero ? "_BETA" : "" ); if (equal) { printf(" - passed\n\n"); } else { printf(" - failed\n\n"); printf("%s", tileKernelSource ); } fflush(stdout); system("PAUSE"); #endif err = clReleaseMemObject(bufA); CL_CHECK(err); err = clReleaseMemObject(bufB); CL_CHECK(err); err = clReleaseMemObject(bufC); CL_CHECK(err); //err = clReleaseKernel(kernel); CL_CHECK(err); err = clReleaseCommandQueue(queue); CL_CHECK(err); err = clReleaseContext(context); CL_CHECK(err); free(A); free(B); free(C); free(naiveC); //free(source); } int main(void) { #if 0 srand((unsigned int)time(NULL)); unsigned int **kernels = new unsigned int*[numKernels]; for (unsigned int i = 0; i < numKernels; i++) { kernels[i] = #if SGEMM sgemmKernelEnumeration[i]; #elif DGEMM dgemmKernelEnumeration[i]; #elif CGEMM cgemmKernelEnumeration[i]; #elif ZGEMM zgemmKernelEnumeration[i]; #endif } for (unsigned int kernelIdx = 0; kernelIdx < numKernels; kernelIdx++) { printf("kernelIdx = %u\n", kernelIdx); /* {isColumnMajor, transA, transB, betaNonZero, wgNumRows, wgNumCols, mtNumRows, mtNumCols, }*/ unsigned int *kernelParameters = kernels[kernelIdx]; unsigned int columnMajor = kernelParameters[0]; unsigned int transA = kernelParameters[1]; unsigned int transB = kernelParameters[2]; unsigned int betaNonZero = kernelParameters[3]; unsigned int macroTileNumRows = kernelParameters[4]; unsigned int macroTileNumCols = kernelParameters[5]; unsigned int unroll = kernelParameters[6]; unsigned int mSpill = kernelParameters[7]; unsigned int nSpill = kernelParameters[8]; testKernelParameterCombination( columnMajor, transA, transB, betaNonZero, macroTileNumRows, macroTileNumCols, unroll, mSpill, nSpill ); } // end for #else unsigned int columnMajor = 1; unsigned int transA = 0; unsigned int transB = 1; unsigned int beta = 0; unsigned int macroTileNumRows = 16*4; unsigned int macroTileNumCols = 16*4; unsigned int unroll = 8; unsigned int mSpill = 0; unsigned int nSpill = 0; testKernelParameterCombination( columnMajor, transA, transB, true, macroTileNumRows, macroTileNumCols, unroll, mSpill, nSpill ); #endif //system("PAUSE"); //Sleep(5000); // ms exit(EXIT_SUCCESS); }; cl_platform_id getPlatform(const char *name) { cl_int err; cl_uint nrPlatforms, i; cl_platform_id *list, platform; char platformName[64]; err = 
clGetPlatformIDs(0, NULL, &nrPlatforms); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { return NULL; } list = (cl_platform_id*)malloc(nrPlatforms * sizeof(*list)); if (list == NULL) { return NULL; } err = clGetPlatformIDs(nrPlatforms, list, NULL); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { free(list); return NULL; } platform = NULL; for (i = 0; i < nrPlatforms; i++) { err = clGetPlatformInfo(list[i], CL_PLATFORM_NAME, sizeof(platformName), platformName, NULL); assert(err == CL_SUCCESS); if ((err == CL_SUCCESS) && (strcmp(platformName, name) == 0)) { platform = list[i]; break; } } free(list); return platform; } cl_device_id getDevice( cl_platform_id platform, const char *name) { cl_int err; cl_uint nrDevices, i; cl_device_id *list, device; char deviceName[64]; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { return NULL; } list = (cl_device_id*)malloc(nrDevices * sizeof(*list)); assert(list); if (list == NULL) { return NULL; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, nrDevices, list, NULL); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { free(list); return NULL; } device = NULL; for (i = 0; i < nrDevices; i++) { err = clGetDeviceInfo(list[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); assert(err == CL_SUCCESS); if ((err == CL_SUCCESS) && (strcmp(deviceName, name) == 0)) { device = list[i]; break; } } free(list); return device; } cl_kernel createKernel( const char* source, cl_context context, const char* options, cl_int* error) { printf("BuildOptions: %s\n", options ); //printf("Kernel Source:\n%s", source ); cl_int err; cl_device_id device; cl_program program; cl_kernel kernel; size_t logSize; char *log; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL); assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return NULL; } program = clCreateProgramWithSource(context, 1, &source, NULL, &err); assert(err == CL_SUCCESS); assert(program != NULL); if (program == NULL) { if (error != NULL) { *error = err; } return NULL; } err = clBuildProgram(program, 1, &device, options, NULL, NULL); if (err != CL_SUCCESS) { logSize = 0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); log = (char*)malloc(logSize + 1); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); printf("=== Build Log [%lu]===\n%s\n", logSize, log); free(log); } assert(err == CL_SUCCESS); if (err != CL_SUCCESS) { clReleaseProgram(program); if (error != NULL) { *error = err; } return NULL; } kernel = NULL; cl_uint num_kernels_ret; err = clCreateKernelsInProgram(program, 0, NULL, &num_kernels_ret); err = clCreateKernelsInProgram(program, 1, &kernel, NULL); assert(err == CL_SUCCESS); assert(kernel != NULL); clReleaseProgram(program); // kernel name size_t length; char kernelName[64]; err = clGetKernelInfo( kernel, CL_KERNEL_FUNCTION_NAME, 64, kernelName, &length ); printf("KernelName[%lu]: %s\n", length, kernelName); // kernel arguments cl_uint numArguments; err = clGetKernelInfo( kernel, CL_KERNEL_NUM_ARGS, sizeof(numArguments), &numArguments, NULL ); if (error != NULL) { *error = err; } return kernel; } clblas-2.10/src/library/blas/AutoGemm/Common.py000066400000000000000000000034741264277366700214240ustar00rootroot00000000000000################################################################################ # Auto-Gemm ################################################################################ 
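#
# Usage sketch (illustrative only, not part of the original module): the
# AutoGemm generator scripts are expected to drive the path helpers defined
# below roughly as follows. The output directory "my/output/dir" and the
# OpenCL C version string are placeholder values, not defaults from this file.
#
#   import Common
#   Common.setOutputPath("my/output/dir")    # generated files are rooted here
#   Common.setClCompilerVersion("1.2")       # consumed by the -cl-std build options
#   srcDir = Common.getKernelSourcePath()    # "my/output/dir/AutoGemmKernelSources/"
#   incDir = Common.getIncludePath()         # "my/output/dir/AutoGemmIncludes/"
#   banner = Common.getAutoGemmHeader()      # header prepended to every generated file
#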
outputPath = "" clCompilerVersion = "2.0" def setClCompilerVersion(version): global clCompilerVersion clCompilerVersion = version def getClCompilerVersion(): global clCompilerVersion return clCompilerVersion def setOutputPath(path): global outputPath outputPath = path + "/" def getOutputPath(): global outputPath return outputPath def getRelativeKernelSourcePath(): return "AutoGemmKernelSources/" def getRelativeKernelBinaryPath(): return "AutoGemmKernelBinaries/" def getRelativeIncludePath(): return "AutoGemmIncludes/" def getKernelSourcePath(): return getOutputPath() + getRelativeKernelSourcePath() def getKernelBinaryPath(): return getOutputPath() + getRelativeKernelBinaryPath() def getIncludePath(): return getOutputPath() + getRelativeIncludePath() def getAutoGemmHeader(): return ( "/*******************************************************************************\n" " * This file was auto-generated using the AutoGemm.py python script.\n" " * DO NOT modify this file! Instead, make changes to scripts in\n" " * clBLAS/src/library/blas/AutoGemm/ then re-generate files\n" " * (otherwise local changes will be lost after re-generation).\n" " ******************************************************************************/\n\n" ) hostDataChar = { "s":"s", "d":"d", "c":"c", "z":"z" } hostDataType = { "s":"float", "d":"double", "c":"float2", "z":"double2" } openclDataType = { "s":"float", "d":"double", "c":"float2", "z":"double2" } precisionInt = { "s":0, "d":1, "c":2, "z":3 } orderInt = { "clblasRowMajor":0, "clblasColumnMajor":1 } transposeInt = { "N":0, "T":1, "C":2 } clblas-2.10/src/library/blas/AutoGemm/Includes.py000066400000000000000000000506601264277366700217410ustar00rootroot00000000000000import os import sys import getopt import Common import AutoGemmParameters import KernelParameters ################################################################################ # SINC - Kernel Source Includes ################################################################################ class KernelSourceIncludes: ############################################################################## # SINC - default constructor ############################################################################## def __init__(self): self.incFileName = Common.getIncludePath() + "AutoGemmKernelSources.h" self.incFile = open(self.incFileName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.incStr = "#ifndef AUTOGEMM_KERNEL_SOURCE_INCLUDES_H\n" self.incStr += "#define AUTOGEMM_KERNEL_SOURCE_INCLUDES_H\n" self.incStr += "\n" self.cppFileName = Common.getIncludePath() + "AutoGemmKernelSources.cpp" self.cppFile = open(self.cppFileName, "w") self.cppFile.write( Common.getAutoGemmHeader() ) self.cppStr = "\n" self.cppStr += "#include \"%sAutoGemmKernelSources.h\"\n" % Common.getRelativeIncludePath() self.cppStr += "#include \"UserGemmKernelSources/UserGemmKernelSourceIncludes.cpp\"\n" #self.cppStr += "#include \"UserGemmKernelSources/UserGemmKernelSources.cpp\"\n" def addKernel(self, kernel): kernelName = kernel.getName() self.incStr += "extern const unsigned int %s_workGroupNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_workGroupNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_unroll;\n" % kernelName self.incStr += "extern const char * const %s_src;\n" % kernelName self.cppStr += "#include \"%s%s_src.cpp\"\n" % 
(Common.getRelativeKernelSourcePath(), kernelName) kernelName = kernel.getRowName() self.incStr += "extern const unsigned int %s_workGroupNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_workGroupNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_unroll;\n" % kernelName self.incStr += "extern const char * const %s_src;\n" % kernelName self.cppStr += "#include \"%s%s_src.cpp\"\n" % (Common.getRelativeKernelSourcePath(), kernelName ) kernelName = kernel.getColName() self.incStr += "extern const unsigned int %s_workGroupNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_workGroupNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_unroll;\n" % kernelName self.incStr += "extern const char * const %s_src;\n" % kernelName self.cppStr += "#include \"%s%s_src.cpp\"\n" % (Common.getRelativeKernelSourcePath(), kernelName) kernelName = kernel.getCornerName() self.incStr += "extern const unsigned int %s_workGroupNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_workGroupNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumRows;\n" % kernelName self.incStr += "extern const unsigned int %s_microTileNumCols;\n" % kernelName self.incStr += "extern const unsigned int %s_unroll;\n" % kernelName self.incStr += "extern const char * const %s_src;\n" % kernelName self.cppStr += "#include \"%s%s_src.cpp\"\n" % (Common.getRelativeKernelSourcePath(), kernelName) self.incFile.write( self.incStr ) self.incStr = "" self.cppFile.write( self.cppStr ) self.cppStr = "" def writeToFile(self): self.incFile.write( self.incStr ) self.incFile.write( "\n#endif\n" ) self.incFile.close() self.cppFile.write( self.cppStr ) self.cppFile.close() ################################################################################ # BINC - Kernel Binary Includes ################################################################################ class KernelBinaryIncludes: ############################################################################## # BINC - default constructor ############################################################################## def __init__(self): self.incFileName = Common.getIncludePath() + "AutoGemmKernelBinaries.h" self.incFile = open(self.incFileName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.incStr = "" self.incStr += "#include \n" self.incStr += "\n#ifndef AUTOGEMM_KERNEL_BINARIES_H\n" self.incStr += "#define AUTOGEMM_KERNEL_BINARIES_H\n" self.incStr += "\n" self.cppFileName = Common.getIncludePath() + "AutoGemmKernelBinaries.cpp" self.cppFile = open(self.cppFileName, "w") self.cppFile.write( Common.getAutoGemmHeader() ) self.cppStr = "" self.cppStr += "#include \"%sAutoGemmKernelBinaries.h\"\n" % Common.getRelativeIncludePath() self.cppStr += "\n" self.cppStr += "#ifdef AUTOGEMM_USE_PRE_COMPILED_KERNELS\n" self.cppStr += "#include \"%sAutoGemmKernelBinariesPreCompiled.h\"\n" % Common.getRelativeKernelBinaryPath() self.cppStr += "#endif\n" self.cppStr += "\n" def addKernel(self, kernel): kernelName = kernel.getName() self.incStr += "extern unsigned char *%s_bin;\n" % kernelName self.incStr += "extern size_t %s_binSize;\n" % kernelName self.cppStr += "#ifndef 
KERNEL_" + kernelName.upper() + "_BIN_CPP\n" self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getRowName() self.incStr += "extern unsigned char *%s_bin;\n" % kernelName self.incStr += "extern size_t %s_binSize;\n" % kernelName self.cppStr += "#ifndef KERNEL_" + kernelName.upper() + "_BIN_CPP\n" self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getColName() self.incStr += "extern unsigned char *%s_bin;\n" % kernelName self.incStr += "extern size_t %s_binSize;\n" % kernelName self.cppStr += "#ifndef KERNEL_" + kernelName.upper() + "_BIN_CPP\n" self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" kernelName = kernel.getCornerName() self.incStr += "extern unsigned char *%s_bin;\n" % kernelName self.incStr += "extern size_t %s_binSize;\n" % kernelName self.cppStr += "#ifndef KERNEL_" + kernelName.upper() + "_BIN_CPP\n" self.cppStr += "unsigned char *%s_bin = 0;\n" % kernelName self.cppStr += " size_t %s_binSize = 0;\n" % kernelName self.cppStr += "#else\n" self.cppStr += "#pragma message(\"AutoGemmKernelBinaries.cpp: %s was pre-compiled.\")\n" % kernelName self.cppStr += "#endif\n" self.incFile.write( self.incStr ) self.incStr = "" self.cppFile.write( self.cppStr ) self.cppStr = "" def writeToFile(self): self.incFile.write( self.incStr ) self.incFile.write( "\n#endif\n" ) self.incFile.close() self.cppFile.write( self.cppStr ) self.cppFile.close() ################################################################################ # CINC - ClKernel Includes ################################################################################ class ClKernelIncludes: ############################################################################## # CINC - default constructor ############################################################################## def __init__(self): self.incName = Common.getIncludePath() + "AutoGemmClKernels.h" self.incFile = open(self.incName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.incStr = "#ifndef AUTOGEMM_CL_KERNELS_H\n" self.incStr += "#define AUTOGEMM_CL_KERNELS_H\n" self.incStr += "#if defined( __APPLE__ ) || defined( __MACOSX )\n" self.incStr += "#include \n" self.incStr += "#else\n" self.incStr += "#include \n" self.incStr += "#endif\n" self.incStr += "\n" self.incStr += "#ifdef __cplusplus\n" self.incStr += "extern \"C\" {\n" self.incStr += "#endif\n" self.incStr += " void initAutoGemmClKernels(void);\n"; self.incStr += "#ifdef __cplusplus\n" self.incStr += "}\n"; self.incStr += "#endif\n" self.incStr += "\n"; self.cppName = Common.getIncludePath() + "AutoGemmClKernels.cpp" self.cppFile = open(self.cppName, "w") self.cppFile.write( Common.getAutoGemmHeader() ) self.cppStr = "#if defined( __APPLE__ ) || defined( __MACOSX )\n" self.cppStr += "#include \n" self.cppStr += "#else\n" self.cppStr += "#include \n" self.cppStr += "#endif\n" self.cppStr += "\n" self.initFunction = ""; 
self.initFunction += "extern \"C\" {\n"; self.initFunction += " void initAutoGemmClKernels(void);\n"; self.initFunction += "}\n"; self.initFunction += "\n"; self.initFunction += "void initAutoGemmClKernels(void) {\n"; self.defines = ""; def addKernel(self, kernel): kernelNames = [ kernel.getName(), kernel.getRowName(), kernel.getColName(), kernel.getCornerName() ] for kernelName in kernelNames: self.incStr += "extern cl_kernel %s_clKernel;\n" % kernelName self.defines += "cl_kernel %s_clKernel = NULL;\n" % kernelName self.initFunction += " if(%s_clKernel != NULL) {\n" % kernelName self.initFunction += " clReleaseKernel(%s_clKernel);\n" % kernelName self.initFunction += " %s_clKernel = NULL;\n" % kernelName self.initFunction += " }\n" self.incFile.write( self.incStr ) self.incStr = "" # self.cppFile.write( self.cppStr ) # self.cppStr = "" def writeToFile(self): self.incFile.write( self.incStr ) self.incFile.write( "\n#endif\n" ) self.incFile.close() self.initFunction += "}\n"; self.cppStr += self.defines + "\n"; self.defines = ""; self.cppStr += self.initFunction + "\n"; self.initFunction = ""; # self.cppStr += "\n"; # self.cppStr += "initAutoGemmClKernels();\n"; self.cppFile.write( self.cppStr ) self.cppFile.close() ################################################################################ # KSBO - Kernel Source Build Options ################################################################################ class KernelSourceBuildOptions: ############################################################################## # KSBO - default constructor ############################################################################## def __init__(self): self.incName = Common.getIncludePath() + "AutoGemmKernelBuildOptionsSource.h" self.incFile = open(self.incName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.incStr = "#ifndef AUTOGEMM_KERNEL_SOURCE_BUILD_OPTIONS_H\n" self.incStr += "#define AUTOGEMM_KERNEL_SOURCE_BUILD_OPTIONS_H\n" self.incStr += "\n" self.cppName = Common.getIncludePath() + "AutoGemmKernelBuildOptionsSource.cpp" self.cppFile = open(self.cppName, "w") self.cppFile.write( Common.getAutoGemmHeader() ) self.cppStr = "" self.cppStr += "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsSource.h\"\n" def addKernel(self, kernel): kernelName = kernel.getName() self.incStr += "extern const char * const %s_srcBuildOptions;\n" \ % kernelName self.cppStr += "const char * const %s_srcBuildOptions = \"-cl-std=CL%s\";\n" \ % (kernelName, Common.getClCompilerVersion() ) self.incFile.write( self.incStr ) self.incStr = "" self.cppFile.write( self.cppStr ) self.cppStr = "" def writeToFile(self): self.incFile.write( self.incStr ) self.incFile.write( "\n#endif\n" ) self.incFile.close() self.cppFile.write( self.cppStr ) self.cppFile.close() ################################################################################ # KBSO - Kernel Binary Build Options ################################################################################ class KernelBinaryBuildOptions: ############################################################################## # KBSO - default constructor ############################################################################## def __init__(self): self.incName = Common.getIncludePath() + "AutoGemmKernelBuildOptionsBinary.h" self.incFile = open(self.incName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.incStr = "#ifndef AUTOGEMM_KERNEL_BINARY_BUILD_OPTIONS_H\n" self.incStr += "#define AUTOGEMM_KERNEL_BINARY_BUILD_OPTIONS_H\n" 
self.incStr += "\n" self.cppName = Common.getIncludePath() + "AutoGemmKernelBuildOptionsBinary.cpp" self.cppFile = open(self.cppName, "w") self.cppFile.write( Common.getAutoGemmHeader() ) self.cppStr = "" self.cppStr += "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsBinary.h\"\n" def addKernel(self, kernel): kernelName = kernel.getName() self.incStr += "extern const char * const %s_binBuildOptions;\n" % kernelName self.cppStr += "const char * const %s_binBuildOptions = \"-cl-std=CL%s\";\n" % (kernelName, Common.getClCompilerVersion() ) self.incFile.write( self.incStr ) self.incStr = "" self.cppFile.write( self.cppStr ) self.cppStr = "" def writeToFile(self): self.incFile.write( self.incStr ) self.incFile.write( "\n#endif\n" ) self.incFile.close() self.cppFile.write( self.cppStr ) self.cppFile.close() ################################################################################ # CPPKE - Cpp Kernel enumeration ################################################################################ class CppKernelEnumeration: ############################################################################## # CPPKE - default constructor ############################################################################## def __init__(self): self.fileName = Common.getIncludePath() + "AutoGemmKernelEnumeration.h" self.kernelStr = "" self.tileStr = "" self.nonTileStr = "" self.kernelCount = 0 self.tileCount = 0 self.nonTileCount = 0 self.precision = "" self.precisionInitialized = False def newPrecision(self, precision): if self.precisionInitialized: self.kernelStr += "};\n" self.kernelStr += "const unsigned int %sgemmNumKernels = %d;\n\n" \ % (self.precision, self.kernelCount) self.tileStr += "};\n" self.tileStr += "const unsigned int %sgemmNumTiles = %d;\n\n" \ % (self.precision, self.tileCount) self.nonTileStr += "};\n" self.nonTileStr += "const unsigned int %sgemmNumNonTiles = %d;\n\n" \ % (self.precision, self.nonTileCount) self.precisionInitialized = True self.precision = precision self.kernelStr += "// order, transA, transB, beta, macroTileNumRows, macroTileNumCols, unroll, mSpill, nSpill\n" self.kernelStr += "unsigned int " + precision + "gemmKernelEnumeration[][9] = {\n" self.tileStr += "// macroTileNumRows, macroTileNumCols, unroll\n" self.tileStr += "unsigned int " + precision + "gemmTileEnumeration[][3] = {\n" self.nonTileStr += "// order, transA, transB, beta\n" self.nonTileStr += "unsigned int " + precision + "gemmNonTileEnumeration[][4] = {\n" self.tileCount = 0 self.nonTileCount = 0 self.kernelCount = 0 def addTile(self, tile): self.tileStr += " { %3u, %3u, %1u },\n" % ( \ tile.macroTileNumRows, \ tile.macroTileNumCols, \ tile.unroll ) self.tileCount += 1 def addNonTile(self, nonTile): self.nonTileStr += " { %1u, %1u, %1u, %1u },\n" % ( \ 1 if nonTile.order=="clblasColumnMajor" else 0, \ 0 if nonTile.transA=="N" else 1 if nonTile.transA=="T" else 2 , \ 0 if nonTile.transB=="N" else 1 if nonTile.transB=="T" else 2, \ 1 if nonTile.beta>0 else 0 ) self.nonTileCount += 1 def addKernel(self, kernel): # 6) list to add to ktest for automated kernel testing for mSpill in range(0, 2): for nSpill in range(0, 2): self.kernelStr += " { %1u, %1u, %1u, %1u, %3u, %3u, %2u, %1u, %1u },\n" % ( \ 1 if kernel.order=="clblasColumnMajor" else 0, \ 0 if kernel.transA=="N" else 1 if kernel.transA=="T" else 2 , \ 0 if kernel.transB=="N" else 1 if kernel.transB=="T" else 2, \ 1 if kernel.beta>0 else 0, \ kernel.macroTileNumRows, \ kernel.macroTileNumCols, \ kernel.unroll, \ mSpill, \ nSpill 
) self.kernelCount += 4 def writeToFile(self): self.kernelStr += "};\n" self.kernelStr += "const unsigned int %sgemmNumKernels = %d;\n" % (self.precision, self.kernelCount) self.tileStr += "};\n" self.tileStr += "const unsigned int %sgemmNumTiles = %d;\n" % (self.precision, self.tileCount) self.nonTileStr += "};\n" self.nonTileStr += "const unsigned int %sgemmNumNonTiles = %d;\n" % (self.precision, self.nonTileCount) incFile = open(self.fileName, "w") incFile.write( Common.getAutoGemmHeader() ) incFile.write( self.tileStr ) incFile.write( "\n\n" ) incFile.write( self.nonTileStr ) incFile.write( "\n\n" ) incFile.write( self.kernelStr ) incFile.close() ################################################################################ # Write Includes ################################################################################ def writeIncludes(): print("AutoGemm.py: Generating include files.") if not os.path.exists( Common.getIncludePath() ): os.makedirs( Common.getIncludePath() ) kernelSourceIncludes = KernelSourceIncludes() kernelBinaryIncludes = KernelBinaryIncludes() clKernelIncludes = ClKernelIncludes() kernelSourceBuildOptions = KernelSourceBuildOptions() kernelBinaryBuildOptions = KernelBinaryBuildOptions() cppKernelEnumeration = CppKernelEnumeration() # for each precision kernel = KernelParameters.KernelParameters() for precision in AutoGemmParameters.precisions: kernel.precision = precision cppKernelEnumeration.newPrecision(precision) # valid tiles for this precision tiles = AutoGemmParameters.getTilesForPrecision(precision) # add tiles for this precision to Cpp for tile in tiles: cppKernelEnumeration.addTile(tile) # for non tile parameters for order in AutoGemmParameters.orders: kernel.order = order for transA in AutoGemmParameters.transposes[precision]: kernel.transA = transA for transB in AutoGemmParameters.transposes[precision]: kernel.transB = transB for beta in AutoGemmParameters.betas: kernel.beta = beta # add this nonTile combo for this precision to Cpp cppKernelEnumeration.addNonTile(kernel) # for tile parameters for tile in tiles: kernel.useTile(tile) kernelSourceIncludes.addKernel(kernel) kernelBinaryIncludes.addKernel(kernel) kernelSourceBuildOptions.addKernel(kernel) kernelBinaryBuildOptions.addKernel(kernel) clKernelIncludes.addKernel(kernel) cppKernelEnumeration.addKernel(kernel) # save written files kernelSourceIncludes.writeToFile() kernelBinaryIncludes.writeToFile() clKernelIncludes.writeToFile() kernelSourceBuildOptions.writeToFile() kernelBinaryBuildOptions.writeToFile() cppKernelEnumeration.writeToFile() ################################################################################ # Main ################################################################################ if __name__ == "__main__": if len(sys.argv) == 2: Common.setOutputPath(sys.argv[1]) else: print("Warning: No output path specified; default is working directory.") writeIncludes() clblas-2.10/src/library/blas/AutoGemm/KernelOpenCL.py000066400000000000000000000627441264277366700224620ustar00rootroot00000000000000import os import sys import copy import Common import KernelParameters import AutoGemmParameters import argparse ############################################################################## # Make OpenCL Kernel String ############################################################################## def makeOpenCLKernelString(kernel): endLine = "\\n\"\n\"" #################################### # parameters valid? 
if kernel.isValid() == False: return kernel.getName() + " invalid" #################################### # initializations kStr = "" kStr += endLine kStr += "/* %s */" % kernel.getName() kStr += endLine #################################### # Double precision pragma prec = kernel.getName()[0].lower() if prec == "d" or prec == "z": kStr += endLine kStr += "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" + endLine #################################### # kernel parameters kStr += endLine kStr += "/* kernel parameters */" + endLine #if kernel.order == "clblasColumnMajor": # kStr += "#define COLUMN_MAJOR 1" + endLine #else: # kStr += "#define COLUMN_MAJOR 0" + endLine #if kernel.transA == "T": # kStr += "#define TRANSPOSE_A 1" + endLine #else: # kStr += "#define TRANSPOSE_A 0" + endLine #if kernel.transB == "T": # kStr += "#define TRANSPOSE_B 1" + endLine #else: # kStr += "#define TRANSPOSE_B 0" + endLine #kStr += "" + endLine kStr += "#define WG_NUM_ROWS %d%s" % (kernel.workGroupNumRows, endLine ) kStr += "#define WG_NUM_COLS %d%s" % (kernel.workGroupNumCols, endLine ) kStr += "#define MICRO_TILE_NUM_ROWS %d%s" % (kernel.microTileNumRows, endLine ) kStr += "#define MICRO_TILE_NUM_COLS %d%s" % (kernel.microTileNumCols, endLine ) kStr += "#define MACRO_TILE_NUM_ROWS %s%s" % ((kernel.workGroupNumRows * kernel.microTileNumRows), endLine ) kStr += "#define MACRO_TILE_NUM_COLS %s%s" % ((kernel.workGroupNumCols * kernel.microTileNumCols), endLine ) kStr += "#define NUM_UNROLL_ITER %s%s" % (kernel.unroll, endLine ) kStr += "" + endLine kStr += "#define LOCAL_ROW_PAD %s%s" % (kernel.localRowPad, endLine) kStr += "#define LOCAL_COL_PAD %s%s" % (kernel.localColPad, endLine) #################################### # global memory indices # A kStr += endLine kStr += "/* global memory indices */" + endLine if (kernel.order=="clblasColumnMajor")==(kernel.transA=="N"): kStr += "#define GET_GLOBAL_INDEX_A(ROW,COL) ((COL)*lda+(ROW))" + endLine else: kStr += "#define GET_GLOBAL_INDEX_A(ROW,COL) ((ROW)*lda+(COL))" + endLine # B if (kernel.order=="clblasColumnMajor")==(kernel.transB=="N"): kStr += "#define GET_GLOBAL_INDEX_B(ROW,COL) ((COL)*ldb+(ROW))" + endLine else: kStr += "#define GET_GLOBAL_INDEX_B(ROW,COL) ((ROW)*ldb+(COL))" + endLine # C if (kernel.order=="clblasColumnMajor"): kStr += "#define GET_GLOBAL_INDEX_C(ROW,COL) ((COL)*ldc+(ROW))" + endLine else: kStr += "#define GET_GLOBAL_INDEX_C(ROW,COL) ((ROW)*ldc+(COL))" + endLine #################################### # local memory indices # A kStr += endLine kStr += "/* local memory indices */" + endLine kStr += "#define GET_LOCAL_INDEX_A(ROW,COL) ((ROW) + (COL)*((MACRO_TILE_NUM_ROWS)+(LOCAL_COL_PAD)) )" + endLine # B kStr += "#define GET_LOCAL_INDEX_B(ROW,COL) ((COL) + (ROW)*((MACRO_TILE_NUM_COLS)+(LOCAL_ROW_PAD)) )" + endLine #################################### # data types kStr += endLine kStr += "/* data types */" + endLine kStr += "#define DATA_TYPE_STR %s%s" \ % (Common.openclDataType[kernel.precision], endLine) if kernel.precision=="s" or kernel.precision=="d": # real arithmetic kStr += "#define TYPE_MAD(MULA,MULB,DST) DST = mad(MULA,MULB,DST);" + endLine if kernel.beta==1: kStr += "#define TYPE_MAD_WRITE(DST,ALPHA,REG,BETA) DST = (ALPHA)*(REG) + (BETA)*(DST);" + endLine else: kStr += "#define TYPE_MAD_WRITE(DST,ALPHA,REG,BETA) DST = (ALPHA)*(REG);" + endLine else: # complex arithmetic if kernel.transA!="C" and kernel.transB!="C": # neither conjugate kStr += ( "#define TYPE_MAD(MULA,MULB,DST) \\\\" + endLine + " DST.s0 = mad( MULA.s0, MULB.s0, DST.s0 ); 
\\\\" + endLine + " DST.s0 = mad( -MULA.s1, MULB.s1, DST.s0 ); \\\\" + endLine + " DST.s1 = mad( MULA.s0, MULB.s1, DST.s1 ); \\\\" + endLine + " DST.s1 = mad( MULA.s1, MULB.s0, DST.s1 );" + endLine ) elif kernel.transA=="C" and kernel.transB!="C": # A conjugate (negate imaginary A.s1) kStr += ( "#define TYPE_MAD(MULA,MULB,DST) \\\\" + endLine + " DST.s0 = mad( MULA.s0, MULB.s0, DST.s0 ); \\\\" + endLine + " DST.s0 = mad( MULA.s1, MULB.s1, DST.s0 ); \\\\" + endLine + " DST.s1 = mad( MULA.s0, MULB.s1, DST.s1 ); \\\\" + endLine + " DST.s1 = mad( -MULA.s1, MULB.s0, DST.s1 );" + endLine ) elif kernel.transA!="C" and kernel.transB=="C": # B conjugate (negate imaginary B.s1) kStr += ( "#define TYPE_MAD(MULA,MULB,DST) \\\\" + endLine + " DST.s0 = mad( MULA.s0, MULB.s0, DST.s0 ); \\\\" + endLine + " DST.s0 = mad( -MULA.s1, -MULB.s1, DST.s0 ); \\\\" + endLine + " DST.s1 = mad( MULA.s0, -MULB.s1, DST.s1 ); \\\\" + endLine + " DST.s1 = mad( MULA.s1, MULB.s0, DST.s1 );" + endLine ) else: # A & B conjugate (negate imaginary .s1) kStr += ( "#define TYPE_MAD(MULA,MULB,DST) \\\\" + endLine + " DST.s0 = mad( MULA.s0, MULB.s0, DST.s0 ); \\\\" + endLine + " DST.s0 = mad( MULA.s1, -MULB.s1, DST.s0 ); \\\\" + endLine + " DST.s1 = mad( MULA.s0, -MULB.s1, DST.s1 ); \\\\" + endLine + " DST.s1 = mad( -MULA.s1, MULB.s0, DST.s1 );" + endLine ) if kernel.beta==1: kStr += ( "#define TYPE_MAD_WRITE( DST, ALPHA, REG, BETA ) \\\\" + endLine + " /* (1) */ \\\\" + endLine + " type_mad_tmp = REG.s0; \\\\" + endLine + " REG.s0 *= ALPHA.s0; \\\\" + endLine + " REG.s0 = mad( -ALPHA.s1, REG.s1, REG.s0 ); \\\\" + endLine + " REG.s1 *= ALPHA.s0; \\\\" + endLine + " REG.s1 = mad( ALPHA.s1, type_mad_tmp, REG.s1 ); \\\\" + endLine + " /* (2) */ \\\\" + endLine + " REG.s0 = mad( BETA.s0, DST.s0, REG.s0 ); \\\\" + endLine + " REG.s0 = mad( -BETA.s1, DST.s1, REG.s0 ); \\\\" + endLine + " REG.s1 = mad( BETA.s1, DST.s0, REG.s1 ); \\\\" + endLine + " REG.s1 = mad( BETA.s0, DST.s1, REG.s1 ); \\\\" + endLine + " /* (3) */ \\\\" + endLine + " DST = REG;" + endLine ) else: kStr += ( "#define TYPE_MAD_WRITE( DST, ALPHA, REG, BETA ) \\\\" + endLine + " /* (1) */ \\\\" + endLine + " type_mad_tmp = REG.s0; \\\\" + endLine + " REG.s0 *= ALPHA.s0; \\\\" + endLine + " REG.s0 = mad( -ALPHA.s1, REG.s1, REG.s0 ); \\\\" + endLine + " REG.s1 *= ALPHA.s0; \\\\" + endLine + " REG.s1 = mad( ALPHA.s1, type_mad_tmp, REG.s1 ); \\\\" + endLine + " /* (2) */ \\\\" + endLine + " REG.s0 = mad( BETA.s0, DST.s0, REG.s0 ); \\\\" + endLine + " REG.s0 = mad( -BETA.s1, DST.s1, REG.s0 ); \\\\" + endLine + " REG.s1 = mad( BETA.s1, DST.s0, REG.s1 ); \\\\" + endLine + " REG.s1 = mad( BETA.s0, DST.s1, REG.s1 ); \\\\" + endLine + " /* (3) */ \\\\" + endLine + " DST = REG;" + endLine ) #################################### # micro-tile kStr += endLine kStr += "/* %dx%d micro-tile */%s" % (kernel.microTileNumRows, kernel.microTileNumCols, endLine) kStr += "#define MICRO_TILE \\\\" + endLine for a in range(0, int(kernel.microTileNumRows)): kStr += " rA[%d] = localA[offA + %d*WG_NUM_ROWS]; \\\\%s" % (a, a, endLine) for b in range(0, int(kernel.microTileNumCols)): kStr += " rB[%d] = localB[offB + %d*WG_NUM_COLS]; \\\\%s" % (b, b, endLine) kStr += " offA += (MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD); \\\\" + endLine kStr += " offB += (MACRO_TILE_NUM_COLS+LOCAL_ROW_PAD); \\\\" + endLine for a in range(0, int(kernel.microTileNumRows)): for b in range(0, int(kernel.microTileNumCols)): kStr += " TYPE_MAD(rA[%d],rB[%d],rC[%d][%d]); \\\\%s" % (a, b, a, b, endLine) kStr += " 
mem_fence(CLK_LOCAL_MEM_FENCE);" + endLine kStr += endLine #################################### # function signature #################################### kStr += "__attribute__((reqd_work_group_size(WG_NUM_COLS,WG_NUM_ROWS,1)))" + endLine kStr += "__kernel void %s" % ( kernel.getName() ) kStr += "(" + endLine # arguments kStr += ( " __global DATA_TYPE_STR const * restrict A," + endLine + " __global DATA_TYPE_STR const * restrict B," + endLine + " __global DATA_TYPE_STR * C," + endLine + " DATA_TYPE_STR const alpha," + endLine + " DATA_TYPE_STR const beta," + endLine + " uint const M," + endLine + " uint const N," + endLine + " uint const K," + endLine + " uint const lda," + endLine + " uint const ldb," + endLine + " uint const ldc," + endLine + " uint const offsetA," + endLine + " uint const offsetB," + endLine + " uint const offsetC" + endLine + ") {" + endLine ) #################################### # apply offsets kStr += endLine kStr += ( " /* apply offsets */" + endLine + " A += offsetA;" + endLine + " B += offsetB;" + endLine + " C += offsetC;" + endLine ) #################################### # allocate registers kStr += endLine kStr += ( " /* allocate registers */" + endLine + " DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = { {0} };" + endLine + " DATA_TYPE_STR rA[MICRO_TILE_NUM_ROWS];" + endLine + " DATA_TYPE_STR rB[MICRO_TILE_NUM_COLS];" + endLine ) #################################### # allocate local memory kStr += endLine kStr += ( " /* allocate local memory */" + endLine + " __local DATA_TYPE_STR localA[NUM_UNROLL_ITER*(MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD)];" + endLine + " __local DATA_TYPE_STR localB[NUM_UNROLL_ITER*(MACRO_TILE_NUM_COLS+LOCAL_ROW_PAD)];" + endLine ) #################################### # work item indices kStr += endLine kStr += " /* work item indices */" + endLine if kernel.isRowKernel(): kStr += " uint groupRow = M / " + str(kernel.workGroupNumRows*kernel.microTileNumRows) + "; // last row" + endLine else: kStr += " uint groupRow = get_group_id(0);" + endLine if kernel.isColKernel(): kStr += " uint groupCol = N / " + str(kernel.workGroupNumCols*kernel.microTileNumCols) + "; // last column" + endLine else: kStr += " uint groupCol = get_group_id(1);" + endLine #################################### # z-order - TODO doesn't improve caching, only lowers occupancy if False: kStr += ( " // convert work-group order to z-order" + endLine + " unsigned int morton = get_group_id(1) * get_num_groups(0) + get_group_id(0);" + endLine + " groupRow = morton;" + endLine + " groupCol = ( groupRow >> 1 );" + endLine + " groupRow &= 0x55555555;" + endLine + " groupCol &= 0x55555555;" + endLine + " groupRow |= ( groupRow >> 1 );" + endLine + " groupCol |= ( groupCol >> 1 );" + endLine + " groupRow &= 0x33333333;" + endLine + " groupCol &= 0x33333333;" + endLine + " groupRow |= ( groupRow >> 2 );" + endLine + " groupCol |= ( groupCol >> 2 );" + endLine + " groupRow &= 0x0f0f0f0f;" + endLine + " groupCol &= 0x0f0f0f0f;" + endLine + " groupRow |= ( groupRow >> 4 );" + endLine + " groupCol |= ( groupCol >> 4 );" + endLine + " groupRow &= 0x00ff00ff;" + endLine + " groupCol &= 0x00ff00ff;" + endLine + " groupRow |= ( groupRow >> 8 );" + endLine + " groupCol |= ( groupCol >> 8 );" + endLine + " groupRow &= 0x0000ffff;" + endLine + " groupCol &= 0x0000ffff;" + endLine + endLine ) kStr += ( " uint localRow = get_local_id(0);" + endLine + " uint localCol = get_local_id(1);" + endLine + " uint localSerial = localRow + localCol*WG_NUM_ROWS;" + endLine ) 
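  ####################################
  # (Illustrative note, not emitted into the kernel source.) Worked example of
  # the cooperative load indexing generated below, assuming a hypothetical
  # 16x16 work-group, 6x6 micro-tile and unroll = 8: the macro tile is 96x96,
  # so each k-block fetches a 96x8 slab of A (768 elements) and an 8x96 slab
  # of B. With 16*16 = 256 work items, numALoads = (16*6*8)/(16*16) = 3 full
  # passes with numALoadsR = 0 left over, and globalARow/globalACol map
  # localSerial + LID*WG_NUM_ROWS*WG_NUM_COLS onto that slab for LID = 0..2.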
#################################### # global indices being loaded kStr += endLine kStr += " /* global indices being loaded */" + endLine if (kernel.order=="clblasColumnMajor")==(kernel.transA=="N"): kStr += ( "#define globalARow(LID) (groupRow*MACRO_TILE_NUM_ROWS + (localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)%MACRO_TILE_NUM_ROWS)" + endLine + "#define globalACol(LID) ((localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)/MACRO_TILE_NUM_ROWS)" + endLine ) else: kStr += ( "#define globalARow(LID) (groupRow*MACRO_TILE_NUM_ROWS + (localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)/NUM_UNROLL_ITER)" + endLine + "#define globalACol(LID) ((localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)%NUM_UNROLL_ITER)" + endLine ) if (kernel.order=="clblasColumnMajor")==(kernel.transB=="N"): kStr += ( "#define globalBRow(LID) ((localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)%NUM_UNROLL_ITER)" + endLine + "#define globalBCol(LID) (groupCol*MACRO_TILE_NUM_COLS + (localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)/NUM_UNROLL_ITER)" + endLine ) else: kStr += ( "#define globalBRow(LID) ((localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)/MACRO_TILE_NUM_COLS)" + endLine + "#define globalBCol(LID) (groupCol*MACRO_TILE_NUM_COLS + (localSerial+(LID)*WG_NUM_ROWS*WG_NUM_COLS)%MACRO_TILE_NUM_COLS)" + endLine ) #kStr += ( # " A += GET_GLOBAL_INDEX_A( globalARow, globalACol );" + endLine + # " B += GET_GLOBAL_INDEX_B( globalBRow, globalBCol );" + endLine ) #################################### # loop over k kStr += endLine kStr += ( " /* loop over k */" + endLine + " uint block_k = K / NUM_UNROLL_ITER;" + endLine + " do {" + endLine ) #################################### # local indices being written kStr += endLine kStr += " /* local indices being written */" + endLine if (kernel.order=="clblasColumnMajor")==(kernel.transA=="N"): kStr += ( "#define localARow (localSerial % MACRO_TILE_NUM_ROWS)" + endLine + "#define localACol (localSerial / MACRO_TILE_NUM_ROWS)" + endLine + "#define localAStride (WG_NUM_ROWS*WG_NUM_COLS)" + endLine ) else: kStr += ( "#define localARow (localSerial / NUM_UNROLL_ITER)" + endLine + "#define localACol (localSerial % NUM_UNROLL_ITER)" + endLine + "#define localAStride (WG_NUM_ROWS*WG_NUM_COLS/NUM_UNROLL_ITER)" + endLine ) if (kernel.order=="clblasColumnMajor")==(kernel.transB=="N"): kStr += ( "#define localBRow ( localSerial % NUM_UNROLL_ITER )" + endLine + "#define localBCol ( localSerial / NUM_UNROLL_ITER )" + endLine + "#define localBStride (WG_NUM_ROWS*WG_NUM_COLS/NUM_UNROLL_ITER)" + endLine ) else: kStr += ( "#define localBRow ( localSerial / MACRO_TILE_NUM_COLS )" + endLine + "#define localBCol ( localSerial % MACRO_TILE_NUM_COLS )" + endLine + "#define localBStride (WG_NUM_ROWS*WG_NUM_COLS)" + endLine ) kStr += ( " __local DATA_TYPE_STR *lA = localA + GET_LOCAL_INDEX_A(localARow, localACol);" + endLine + " __local DATA_TYPE_STR *lB = localB + GET_LOCAL_INDEX_B(localBRow, localBCol);" + endLine + " barrier(CLK_LOCAL_MEM_FENCE);" + endLine ) #################################### # load global -> local # threads to do loading = (workGroupNumRows*workGroupNumCols) # A elements to be loaded = workGroupNumRows*microTileNumRows*unroll # B elements to be loaded = workGroupNumCols*microTileNumCols*unroll kStr += endLine kStr += " /* load global -> local */" + endLine numALoads = (kernel.workGroupNumRows*kernel.microTileNumRows*kernel.unroll) \ / (kernel.workGroupNumRows*kernel.workGroupNumCols) numALoadsR = (kernel.workGroupNumRows*kernel.microTileNumRows*kernel.unroll) \ % (kernel.workGroupNumRows*kernel.workGroupNumCols) numBLoads = 
(kernel.workGroupNumCols*kernel.microTileNumCols*kernel.unroll) \ / (kernel.workGroupNumRows*kernel.workGroupNumCols) numBLoadsR = (kernel.workGroupNumCols*kernel.microTileNumCols*kernel.unroll) \ % (kernel.workGroupNumRows*kernel.workGroupNumCols) # TODO - zeroString for real and complex if kernel.precision == "c": zeroString = "(float2)(0.f, 0.f)" elif kernel.precision == "z": zeroString = "(double2)(0.0, 0.0)" else: zeroString = "0.0" for a in range(0, int(numALoads)): kStr += " lA[ %d*localAStride ] = " % a if kernel.isRowKernel(): kStr += "( globalARow(%d) >= M) ? %s : " % ( a, zeroString ) kStr += "A[ GET_GLOBAL_INDEX_A( globalARow(%d), globalACol(%d) ) ];%s" % (a, a, endLine) if numALoadsR: kStr += " if ( localSerial + " + str(numALoads) + "*WG_NUM_ROWS*WG_NUM_COLS < (WG_NUM_ROWS*MICRO_TILE_NUM_ROWS*NUM_UNROLL_ITER) ) {" + endLine kStr += " lA[ %d*localAStride ] = " % numALoads if kernel.isRowKernel(): kStr += "( globalARow(%d) >= M) ? %s : " % ( numALoads, zeroString ) kStr += "A[ GET_GLOBAL_INDEX_A( globalARow(%d), globalACol(%d) ) ];%s" % (numALoads, numALoads, endLine) kStr += " }" + endLine for b in range(0, int(numBLoads)): kStr += " lB[ %d*localBStride ] = " % b if kernel.isColKernel(): kStr += "( globalBCol(%d) >= N) ? %s : " % ( b, zeroString ) kStr += "B[ GET_GLOBAL_INDEX_B( globalBRow(%d), globalBCol(%d) ) ];%s" % (b, b, endLine) if numBLoadsR: kStr += " if ( localSerial + " + str(numBLoads) + "*WG_NUM_ROWS*WG_NUM_COLS < (WG_NUM_COLS*MICRO_TILE_NUM_COLS*NUM_UNROLL_ITER) ) {" + endLine kStr += " lB[ %d*localBStride ] = " % numBLoads if kernel.isColKernel(): kStr += "(globalBCol(%d) >= N) ? %s : " % ( numBLoads, zeroString ) kStr += "B[ GET_GLOBAL_INDEX_B( globalBRow(%d), globalBCol(%d) ) ];%s" % (numBLoads, numBLoads, endLine) kStr += " }" + endLine kStr += ( " barrier(CLK_LOCAL_MEM_FENCE);" + endLine + " uint offA = localRow;" + endLine + " uint offB = localCol;" + endLine ) #################################### # do mads kStr += endLine kStr += " /* do mads */" + endLine for u in range(0, int(kernel.unroll)): kStr += " MICRO_TILE" + endLine #################################### # shift to next k block kStr += endLine kStr += " /* shift to next k block */" + endLine if (kernel.order=="clblasColumnMajor")==(kernel.transA=="N"): kStr += " A += lda*NUM_UNROLL_ITER;" + endLine else: kStr += " A += NUM_UNROLL_ITER;" + endLine if (kernel.order=="clblasColumnMajor")==(kernel.transB=="N"): kStr += " B += NUM_UNROLL_ITER;" + endLine else: kStr += " B += ldb*NUM_UNROLL_ITER;" + endLine #################################### # end loop kStr += endLine kStr += " } while (--block_k > 0);" + endLine kStr += endLine #################################### # which global Cij index kStr += endLine kStr += " /* which global Cij index */" + endLine kStr += " uint globalCRow = groupRow * MACRO_TILE_NUM_ROWS + localRow;" + endLine kStr += " uint globalCCol = groupCol * MACRO_TILE_NUM_COLS + localCol;" + endLine #################################### # write global Cij kStr += endLine kStr += " /* write global Cij */" + endLine if kernel.precision=="c": kStr += " float type_mad_tmp;" + endLine if kernel.precision=="z": kStr += " double type_mad_tmp;" + endLine for a in range(0, int(kernel.microTileNumRows)): for b in range(0, int(kernel.microTileNumCols)): if kernel.isRowKernel(): kStr += " if (globalCRow+%d*WG_NUM_ROWS < M)" % a if kernel.isColKernel(): kStr += " if (globalCCol+%d*WG_NUM_COLS < N)" % b if kernel.isRowKernel() or kernel.isColKernel(): kStr += "{" kStr += " TYPE_MAD_WRITE( C[ 
GET_GLOBAL_INDEX_C( globalCRow+%d*WG_NUM_ROWS, globalCCol+%d*WG_NUM_COLS) ], alpha, rC[%d][%d], beta )" % (a, b, a, b) if kernel.isRowKernel() or kernel.isColKernel(): kStr += "}" kStr += endLine #################################### # end kernel kStr += endLine kStr += "}" + endLine return kStr ############################################################################## # Write OpenCL kernel to file ############################################################################## def writeOpenCLKernelToFile(kernel): kernelName = kernel.getName() kernelString = makeOpenCLKernelString(kernel) kernelFileName = Common.getKernelSourcePath() + kernelName +"_src.cpp" kernelFile = open(kernelFileName, "w") kernelFile.write( Common.getAutoGemmHeader() ) kernelFile.write("#ifndef KERNEL_" + kernelName.upper() + "_SRC_H\n") kernelFile.write("#define KERNEL_" + kernelName.upper() + "_SRC_H\n") kernelFile.write("\n") kernelFile.write("const unsigned int %s_workGroupNumRows = %u;\n" % (kernel.getName(), kernel.workGroupNumRows ) ) kernelFile.write("const unsigned int %s_workGroupNumCols = %u;\n" % (kernel.getName(), kernel.workGroupNumCols ) ) kernelFile.write("const unsigned int %s_microTileNumRows = %u;\n" % (kernel.getName(), kernel.microTileNumRows ) ) kernelFile.write("const unsigned int %s_microTileNumCols = %u;\n" % (kernel.getName(), kernel.microTileNumCols ) ) kernelFile.write("const unsigned int %s_unroll = %u;\n" % (kernel.getName(), kernel.unroll) ) kernelFile.write("\n") kernelFile.write("const char * const %s_src =\"" % (kernelName) ) kernelFile.write(kernelString) kernelFile.write("\";\n") kernelFile.write("\n") kernelFile.write("#else\n") kernelFile.write("#pragma message(\"AutoGemmKernelSources.cpp: %s was overriden by user kernel.\")\n" % kernel.getName() ) kernelFile.write("#endif\n") kernelFile.close() ############################################################################## # Write OpenCL kernel to file ############################################################################## def writeOpenCLKernels(): if not os.path.exists( Common.getKernelSourcePath() ): os.makedirs( Common.getKernelSourcePath() ) if not os.path.exists( Common.getKernelBinaryPath() ): os.makedirs( Common.getKernelBinaryPath() ) numKernels = 0 # for each precision kernel = KernelParameters.KernelParameters() for precision in AutoGemmParameters.precisions: kernel.precision = precision # valid tiles for this precision tiles = AutoGemmParameters.getTilesForPrecision(precision) # for non tile parameters for order in AutoGemmParameters.orders: kernel.order = order for transA in AutoGemmParameters.transposes[precision]: kernel.transA = transA for transB in AutoGemmParameters.transposes[precision]: kernel.transB = transB for beta in AutoGemmParameters.betas: kernel.beta = beta # for tile parameters for tile in tiles: # tile kernel kernel.useTile(tile) writeOpenCLKernelToFile(kernel) # row kernel rowKernel = copy.copy(kernel) rowKernel.macroTileNumRows = 1 writeOpenCLKernelToFile(rowKernel) # col kernel colKernel = copy.copy(kernel) colKernel.macroTileNumCols = 1 writeOpenCLKernelToFile(colKernel) # corner kernel cornerKernel = copy.copy(kernel) cornerKernel.macroTileNumRows = 1 cornerKernel.macroTileNumCols = 1 writeOpenCLKernelToFile(cornerKernel) numKernels += 4 print("AutoGemm.py: generated %d kernels" % numKernels) ################################################################################ # Main ################################################################################ if __name__ == "__main__": ap 
= argparse.ArgumentParser(description="KernelOpenCL") ap.add_argument("precision", choices=["s","d","c","z"], help="precision" ) ap.add_argument("order", choices=["row","col"], help="order: row major or column major" ) ap.add_argument("transA", choices=["N","T", "C"], help="transA" ) ap.add_argument("transB", choices=["N","T", "C"], help="transB" ) ap.add_argument("beta", choices=[0, 1], type=int, help="0 for beta is zero, 1 for beta is non-zero" ) ap.add_argument("workGroupNumRows", type=int ) ap.add_argument("workGroupNumCols", type=int ) ap.add_argument("microTileNumRows", type=int ) ap.add_argument("microTileNumCols", type=int ) ap.add_argument("unroll", type=int, help="number of iterations to unroll the loop over k" ) ap.add_argument("outputPath", default=".", help="output path; %s will be appended to path" % Common.getRelativeKernelSourcePath() ) args = ap.parse_args() kernel = KernelParameters.KernelParameters() kernel.precision = args.precision if args.order == "col": kernel.order = "clblasColumnMajor" else: kernel.order = "clblasRowMajor" kernel.transA = args.transA kernel.transB = args.transB kernel.beta = args.beta kernel.workGroupNumRows = args.workGroupNumRows kernel.workGroupNumCols = args.workGroupNumCols kernel.microTileNumRows = args.microTileNumRows kernel.microTileNumCols = args.microTileNumCols kernel.unroll = args.unroll Common.setOutputPath(args.outputPath) kernel.macroTileNumRows = kernel.workGroupNumRows * kernel.microTileNumRows kernel.macroTileNumCols = kernel.workGroupNumCols * kernel.microTileNumCols if not os.path.exists( Common.getKernelSourcePath() ): os.makedirs( Common.getKernelSourcePath() ) writeOpenCLKernelToFile(kernel) kernelName = kernel.getName() kernelFileName = Common.getKernelSourcePath() + kernelName +"_src.cpp" print("kernel \"%s\" written to %s" % (kernelName, kernelFileName)) clblas-2.10/src/library/blas/AutoGemm/KernelParameters.py000066400000000000000000000252611264277366700234360ustar00rootroot00000000000000import copy import Common ################################################################################ # Tile Parameters # - parameters which should match matrix system for good performance ################################################################################ class TileParameters: nameFormatTile = "MX%03d_NX%03d_KX%02d" nameFormatRow = "ML%03d_NX%03d_KX%02d" nameFormatCol = "MX%03d_NL%03d_KX%02d" nameFormatCorner = "ML%03d_NL%03d_KX%02d" ############################################################################## # Tile - constructors ############################################################################## def __init__(self): self.workGroupNumRows = -1 self.workGroupNumCols = -1 self.microTileNumRows = -1 self.microTileNumCols = -1 self.macroTileNumRows = -1 self.macroTileNumCols = -1 self.unroll = -1 def __eq__(self, other): return self.workGroupNumRows == other.workGroupNumRows \ and self.workGroupNumCols == other.workGroupNumCols \ and self.microTileNumRows == other.microTileNumRows \ and self.microTileNumCols == other.microTileNumCols \ and self.unroll == other.unroll def __ni__(self, other): return not self.__eq__(other) def __hash__(self): return \ self.workGroupNumRows*2*8*8*256 + \ self.workGroupNumCols*2*8*8 + \ self.microTileNumRows*2*8 + \ self.microTileNumCols*2 + \ self.unroll def __str__(self): return self.getName() def __repr__(self): return self.getName() def __lt__(self, other): return self.getName() < other.getName() def __cmp__(self, other): # Python3 should ignore this method # This is needed 
for python2 for proper comparison try: return cmp(self.getName(), other.getName()) except: self_name = self.getName() other_name = other.getName() if (self_name < other_name): return -1 elif (self_name == other_name): return 0 else: return 1 def printAttributes(self): print("workGroupNumRows = %d" % self.workGroupNumRows) print("workGroupNumCols = %d" % self.workGroupNumCols) print("microTileNumRows = %d" % self.microTileNumRows) print("microTileNumCols = %d" % self.microTileNumCols) print("macroTileNumRows = %d" % self.macroTileNumRows) print("macroTileNumCols = %d" % self.macroTileNumCols) print("unroll = %d" % self.unroll) ############################################################################## # Tile - get Multiples ############################################################################## def getMultipleM(self): return (self.workGroupNumRows * self.microTileNumRows) def getMultipleN(self): return (self.workGroupNumCols * self.microTileNumCols) def getMultipleK(self): return (self.unroll) ############################################################################## # Tile - are tile parameters valid? ############################################################################## def isValid(self): return True """ numALoads = (self.workGroupNumRows*self.microTileNumRows*self.unroll) \ / (self.workGroupNumRows*self.workGroupNumCols) numALoadsR = (self.workGroupNumRows*self.microTileNumRows*self.unroll) \ % (self.workGroupNumRows*self.workGroupNumCols) numBLoads = (self.workGroupNumCols*self.microTileNumCols*self.unroll) \ / (self.workGroupNumRows*self.workGroupNumCols) numBLoadsR = (self.workGroupNumCols*self.microTileNumCols*self.unroll) \ % (self.workGroupNumRows*self.workGroupNumCols) if (numALoads>0 and numALoadsR>0): self.error = ("(%2d * %d * %d = %3d) A elements can't be loaded " "by (%2d * %2d = %3d) threads" ) \ % ( self.workGroupNumRows, self.microTileNumRows, self.unroll, \ (self.workGroupNumRows*self.microTileNumRows*self.unroll), \ self.workGroupNumRows, self.workGroupNumCols, \ (self.workGroupNumRows*self.workGroupNumCols) ) return False elif (numBLoads>0 and numBLoadsR>0): self.error = ( "(%2d * %d * %d = %3d) B elements can't be loaded " "by (%2d * %2d = %3d) threads" ) \ % ( self.workGroupNumCols, self.microTileNumCols, self.unroll, \ (self.workGroupNumCols*self.microTileNumCols*self.unroll), \ self.workGroupNumRows, self.workGroupNumCols, \ (self.workGroupNumRows*self.workGroupNumCols) ) return False else: return True """ ############################################################################## # Tile - get Name ############################################################################## def getName(self): if self.macroTileNumRows < self.workGroupNumRows*self.microTileNumRows: if self.macroTileNumCols < self.workGroupNumCols*self.microTileNumCols: return self.nameFormatCorner \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) else: return self.nameFormatRow \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) else: if self.macroTileNumCols < self.workGroupNumCols*self.microTileNumCols: return self.nameFormatCol \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) else: return self.nameFormatTile \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) def getRowName(self): return self.nameFormatRow \ % ( 
(self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) def getColName(self): return self.nameFormatCol \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) def getCornerName(self): return self.nameFormatCorner \ % ( (self.workGroupNumRows*self.microTileNumRows), \ (self.workGroupNumCols*self.microTileNumCols), self.unroll ) ############################################################################## # Row Kernel # - macroTileNumRows = 1 # - guards around gA -> lA # - guards around gC[gRow,:] = rC[row,:] ############################################################################## def isRowKernel(self): if self.workGroupNumRows * self.microTileNumRows == self.macroTileNumRows: return False; # normal tile kernel else: if self.macroTileNumRows == 1: return True; # single row kernel else: printf( ("ERROR: workGroupNumRows=%u, microTileNumRows=%u " "and macroTileNumRows=%u doesn't make sense\n") \ % (self.workGroupNumRows, self.microTileNumRows, \ self.macroTileNumRows) ); return False; # ERROR ############################################################################## # Col Kernel # - macroTileNumCols = 1 # - guards around gB -> lB # - guards around gC[:,gCol] = rC[:,col] ############################################################################## def isColKernel(self): if self.workGroupNumCols * self.microTileNumCols == self.macroTileNumCols: return False; # normal tile kernel else: if self.macroTileNumCols == 1: return True; # single row kernel else: printf(("ERROR: workGroupNumCols=%u, microTileNumCols=%u " "and macroTileNumCols=%u doesn't make sense\n") \ % (self.workGroupNumCols, self.microTileNumCols, \ self.macroTileNumCols) ); return False; # ERROR ################################################################################ # Non Tile Parameters # - parameters which must match matrix system for correct answer ################################################################################ class NonTileParameters: def __init__(self): self.precision = "" # s, d, c, z self.order = "" # clblasColumnMajor, clblasRowMajor self.transA = "" # N, T, C self.transB = "" # N, T, C self.beta = -1 # 0, 1 def printAttributes(self): print("precision = " + self.precision) print("order = " + self.order) print("transA = " + self.transA) print("transB = " + self.transB) print("beta = %d" % self.beta) ############################################################################## # NonTile - get Name ############################################################################## def getName(self): return "%sgemm_%3s_%1s%1s_B%d" \ % (Common.hostDataChar[self.precision], \ "Col" if self.order=="clblasColumnMajor" else "Row", \ self.transA, self.transB, self.beta ) ################################################################################ # Kernel Parameters ################################################################################ class KernelParameters( NonTileParameters, TileParameters ): ############################################################################## # Kernel - constructor ############################################################################## def __init__(self): NonTileParameters.__init__(self) TileParameters.__init__(self) self.localRowPad = 0 self.localColPad = 0 ############################################################################## # Kernel - use tile ############################################################################## def 
useTile(self, tile): self.workGroupNumRows = tile.workGroupNumRows self.workGroupNumCols = tile.workGroupNumCols self.microTileNumRows = tile.microTileNumRows self.microTileNumCols = tile.microTileNumCols self.macroTileNumRows = tile.macroTileNumRows self.macroTileNumCols = tile.macroTileNumCols self.unroll = tile.unroll def printAttributes(self): NonTileParameters.printAttributes(self) TileParameters.printAttributes(self) ############################################################################## # Kernel - get Name ############################################################################## def getName(self): return NonTileParameters.getName(self) \ + "_" + TileParameters.getName(self) def getRowName(self): return NonTileParameters.getName(self) \ + "_" + TileParameters.getRowName(self) def getColName(self): return NonTileParameters.getName(self) \ + "_" + TileParameters.getColName(self) def getCornerName(self): return NonTileParameters.getName(self) \ + "_" + TileParameters.getCornerName(self) clblas-2.10/src/library/blas/AutoGemm/KernelSelection.py000066400000000000000000000671311264277366700232620ustar00rootroot00000000000000import os import sys import copy import AutoGemmParameters import Common import KernelParameters def indent(il): returnTabs = "" for i in range(0, il): returnTabs += " " return returnTabs def tileInRange( tileMin, tileMax, rangeMin, rangeMax): if ( tileMax < 0 or (tileMax >= rangeMax and rangeMax>0) ) and tileMin <= rangeMin : valid = True else: valid = False #print("Range [%4ux%4u]: [%4u,%4u] is %s b/c" \ # % (rangeMin, rangeMax, tileMin, tileMax, "valid" if valid else "INVALID" )) #print("if ( %i<0 or (%u >= %u and %u>0) and %u <= %u" \ # %( tileMax, tileMax, rangeMax, rangeMax, tileMin, rangeMin )) return valid ################################################################################ # KSL - Kernel Selection Logic File ################################################################################ class KernelSelection: ############################################################################## # KSL - default constructor ############################################################################## def __init__( \ self, \ precisionList, \ orderList, \ transDict, \ betaList, \ unrollDict, \ kernelSelectionData): self.incFileName = Common.getIncludePath() + "AutoGemmKernelSelection.h" self.incFile = open(self.incFileName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.kernelSelectionFileName = Common.getIncludePath() + "AutoGemmKernelSelection.cpp" self.selectionFile = open(self.kernelSelectionFileName, "w") self.selectionFile.write( Common.getAutoGemmHeader() ) self.inc = ( "#include \n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelSources.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBinaries.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsSource.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsBinary.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmClKernels.h\"\n" "\n" "#define EXACT_MULTIPLES(MULTIPLE_STR) MULTIPLE_STR\n" "\n" "// kernel selection logic template\n" "template\n" "void gemmSelectKernel(\n" " clblasOrder order,\n" " clblasTranspose transA,\n" " clblasTranspose transB,\n" " size_t M,\n" " size_t N,\n" " size_t K,\n" " bool betaNonZero,\n" " float optimalNumElementsPerWorkItem,\n" " const char **tileKernelSource,\n" " const char **rowKernelSource,\n" " const char **colKernelSource,\n" " 
const char **cornerKernelSource,\n" " const char **sourceBuildOptions,\n" " const unsigned char **tileKernelBinary,\n" " const unsigned char **rowKernelBinary,\n" " const unsigned char **colKernelBinary,\n" " const unsigned char **cornerKernelBinary,\n" " size_t **tileKernelBinarySize,\n" " size_t **rowKernelBinarySize,\n" " size_t **colKernelBinarySize,\n" " size_t **cornerKernelBinarySize,\n" " const char **binaryBuildOptions,\n" " cl_kernel **tileClKernel,\n" " cl_kernel **rowClKernel,\n" " cl_kernel **colClKernel,\n" " cl_kernel **cornerClKernel,\n" " unsigned int *workGroupNumRows,\n" " unsigned int *workGroupNumCols,\n" " unsigned int *microTileNumRows,\n" " unsigned int *microTileNumCols,\n" " unsigned int *unroll\n" ");\n\n" ) self.logic = "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelSelection.h\"\n" #################################### # precision kernel = KernelParameters.KernelParameters() for precision in precisionList: #self.selectionFile.write( self.logic ) #self.logic = "" kernel.precision = precision sizeEvents = kernelSelectionData[precision] self.logic += ( "\n// " + precision + "gemm kernel selection logic\n" "template<>\n" "void gemmSelectKernel<" ) if precision == "s": self.logic += "float" elif precision == "d": self.logic += "double" elif precision == "c": self.logic += "FloatComplex" else: self.logic += "DoubleComplex" self.logic += ( ">(\n" " clblasOrder order,\n" " clblasTranspose transA,\n" " clblasTranspose transB,\n" " size_t M,\n" " size_t N,\n" " size_t K,\n" " bool betaNonZero,\n" " float optimalNumElementsPerWorkItem,\n" " const char **tileKernelSource,\n" " const char **rowKernelSource,\n" " const char **colKernelSource,\n" " const char **cornerKernelSource,\n" " const char **sourceBuildOptions,\n" " const unsigned char **tileKernelBinary,\n" " const unsigned char **rowKernelBinary,\n" " const unsigned char **colKernelBinary,\n" " const unsigned char **cornerKernelBinary,\n" " size_t **tileKernelBinarySize,\n" " size_t **rowKernelBinarySize,\n" " size_t **colKernelBinarySize,\n" " size_t **cornerKernelBinarySize,\n" " const char **binaryBuildOptions,\n" " cl_kernel **tileClKernel,\n" " cl_kernel **rowClKernel,\n" " cl_kernel **colClKernel,\n" " cl_kernel **cornerClKernel,\n" " unsigned int *workGroupNumRows,\n" " unsigned int *workGroupNumCols,\n" " unsigned int *microTileNumRows,\n" " unsigned int *microTileNumCols,\n" " unsigned int *unroll\n" ") {\n" ) #################################### # order for order in orderList: #print(precision + "gemm" + "_" + order) kernel.order = order self.logic += indent(1) + "if (order == " + order + ") {\n" transList = transDict[precision] #################################### # transA for transA in transList: #print(precision + "gemm" + "_" + order + "_" + transA) kernel.transA = transA self.logic += indent(2) + "if (transA == " if transA == "N": self.logic += "clblasNoTrans" elif transA == "T": self.logic += "clblasTrans" else: self.logic += "clblasConjTrans" self.logic += ") {\n" #################################### # transB for transB in transList: kernel.transB = transB self.logic += indent(3) + "if (transB == " if transB == "N": self.logic += "clblasNoTrans" elif transB == "T": self.logic += "clblasTrans" else: self.logic += "clblasConjTrans" self.logic += ") {\n" #################################### # beta for beta in betaList: #print(precision + "gemm" + "_" + order + "_" + transA + "_" + transB + "_B" + str(beta)) kernel.beta = beta self.logic += indent(4) + "if ( " if beta == 0: self.logic += 
"!betaNonZero" else: self.logic += "betaNonZero" self.logic += " ) {\n" #################################### # if size event for sizeEvent in sizeEvents: self.selectionFile.write( self.logic ) self.logic = "" sizeMin = sizeEvent[0] fallbackTile = sizeEvent[1] validTiles = sizeEvent[2] self.logic += indent(5)+"if ( M*N >= "+str(sizeMin)+"*"+str(sizeMin) + ") {\n" #print(precision + "gemm" + "_" + order + "_" + transA + "_" + transB + "_B" + str(beta) + "_" + str(sizeMin) + "->" + str(sizeMax)) #################################### # valid tiles self.logic += indent(6)+"// valid tiles\n" for tileParams in validTiles: kernel.workGroupNumRows = tileParams[0] kernel.workGroupNumCols = tileParams[1] kernel.microTileNumRows = tileParams[2] kernel.microTileNumCols = tileParams[3] kernel.macroTileNumRows = kernel.workGroupNumRows*kernel.microTileNumRows kernel.macroTileNumCols = kernel.workGroupNumCols*kernel.microTileNumCols for unroll in unrollDict[precision]: kernel.unroll = unroll self.logic += indent(6)+"if ( M%%%d == 0 && N%%%d == 0 && K%%%d == 0) {\n" \ % (kernel.getMultipleM(), kernel.getMultipleN(), kernel.getMultipleK()) self.addBodyForKernel( kernel ) self.logic += indent(6) + "}\n" #################################### # fallback tile - TODO all tiles begin added self.logic += indent(6)+"// fallback tile\n" #print("\nFallback[%i, %i]"%(sizeMin, sizeMax)) kernel.workGroupNumRows = fallbackTile[0] kernel.workGroupNumCols = fallbackTile[1] kernel.microTileNumRows = fallbackTile[2] kernel.microTileNumCols = fallbackTile[3] kernel.macroTileNumRows = kernel.workGroupNumRows*kernel.microTileNumRows kernel.macroTileNumCols = kernel.workGroupNumCols*kernel.microTileNumCols for unroll in unrollDict[precision]: kernel.unroll = unroll self.logic += indent(6)+"if ( K%%%d == 0 ) {\n" \ % (kernel.getMultipleK()) self.addBodyForKernel( kernel ) self.logic += indent(6) + "}\n" #################################### # end size event self.logic += indent(5) + "} // end size\n" #################################### # end beta self.logic += indent(4) + "} // end beta\n" #################################### # end transB self.logic += indent(3) + "} // end transB\n" #################################### # end transA self.logic += indent(2) + "} // end transA\n" #################################### # end order self.logic += indent(1) + "} // end order\n" #################################### # end precision self.logic += indent(0) + "} // end precision function\n" # write last precision self.selectionFile.write( self.logic ) self.selectionFile.write( "\n" ) def addBodyForKernel( self, kernel ): #self.logic += indent(7) + "printf(\"selected kernel: " + kernel.getName() + "\\n\");\n" self.logic += indent(7) + "*tileKernelSource = " + kernel.getName() + "_src;\n" self.logic += indent(7) + "*rowKernelSource = " + kernel.getRowName() + "_src;\n" self.logic += indent(7) + "*colKernelSource = " + kernel.getColName() + "_src;\n" self.logic += indent(7) + "*cornerKernelSource = " + kernel.getCornerName() + "_src;\n" self.logic += indent(7) + "*sourceBuildOptions = " + kernel.getName() + "_srcBuildOptions;\n" self.logic += indent(7) + "*tileKernelBinary = " + kernel.getName() + "_bin;\n" self.logic += indent(7) + "*rowKernelBinary = " + kernel.getRowName() + "_bin;\n" self.logic += indent(7) + "*colKernelBinary = " + kernel.getColName() + "_bin;\n" self.logic += indent(7) + "*cornerKernelBinary = " + kernel.getCornerName() + "_bin;\n" self.logic += indent(7) + "*tileKernelBinarySize = &" + kernel.getName() + "_binSize;\n" 
self.logic += indent(7) + "*rowKernelBinarySize = &" + kernel.getRowName() + "_binSize;\n" self.logic += indent(7) + "*colKernelBinarySize = &" + kernel.getColName() + "_binSize;\n" self.logic += indent(7) + "*cornerKernelBinarySize = &" + kernel.getCornerName() + "_binSize;\n" self.logic += indent(7) + "*binaryBuildOptions = " + kernel.getName() + "_binBuildOptions;\n" self.logic += indent(7) + "*tileClKernel = &" + kernel.getName() + "_clKernel;\n" self.logic += indent(7) + "*rowClKernel = &" + kernel.getRowName() + "_clKernel;\n" self.logic += indent(7) + "*colClKernel = &" + kernel.getColName() + "_clKernel;\n" self.logic += indent(7) + "*cornerClKernel = &" + kernel.getCornerName() + "_clKernel;\n" self.logic += indent(7) + "*workGroupNumRows = " + kernel.getName() + "_workGroupNumRows;\n" self.logic += indent(7) + "*workGroupNumCols = " + kernel.getName() + "_workGroupNumCols;\n" self.logic += indent(7) + "*microTileNumRows = " + kernel.getName() + "_microTileNumRows;\n" self.logic += indent(7) + "*microTileNumCols = " + kernel.getName() + "_microTileNumRows;\n" self.logic += indent(7) + "*unroll = " + kernel.getName() + "_unroll;\n" self.logic += indent(7) + "return;\n" ############################################################################## # KSL - write to file ############################################################################## def writeToFile(self): self.selectionFile.close() self.incFile.write( self.inc ) self.incFile.close() ################################################################################ # KSM - Kernel Selection Manual/Specific File ################################################################################ class KernelSelectionSpecific: zeroIndent = " " tab = " " ############################################################################## # KSL - default constructor ############################################################################## def __init__(self): self.incFileName = Common.getIncludePath() + "AutoGemmKernelSelectionSpecific.h" self.incFile = open(self.incFileName, "w") self.incFile.write( Common.getAutoGemmHeader() ) self.kernelSelectionFileName = Common.getIncludePath() + "AutoGemmKernelSelectionSpecific.cpp" self.selectionFile = open(self.kernelSelectionFileName, "w") self.selectionFile.write( Common.getAutoGemmHeader() ) self.inc = ( "#include \n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelSources.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBinaries.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsSource.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelBuildOptionsBinary.h\"\n" "#include \"" + Common.getRelativeIncludePath() + "AutoGemmClKernels.h\"\n" "\n" "// kernel selection specific template\n" "template\n" "bool gemmSelectKernelSpecific(\n" " clblasOrder order,\n" " clblasTranspose transA,\n" " clblasTranspose transB,\n" " bool betaNonZero,\n" " unsigned int macroTileNumRows,\n" " unsigned int macroTileNumCols,\n" " unsigned int unroll,\n" " const char **tileKernelSource,\n" " const char **rowKernelSource,\n" " const char **colKernelSource,\n" " const char **cornerKernelSource,\n" " const char **sourceBuildOptions,\n" " const unsigned char **tileKernelBinary,\n" " const unsigned char **rowKernelBinary,\n" " const unsigned char **colKernelBinary,\n" " const unsigned char **cornerKernelBinary,\n" " size_t **tileKernelBinarySize,\n" " size_t **rowKernelBinarySize,\n" " size_t **colKernelBinarySize,\n" " size_t 
**cornerKernelBinarySize,\n" " const char **binaryBuildOptions,\n" " cl_kernel **tileClKernel,\n" " cl_kernel **rowClKernel,\n" " cl_kernel **colClKernel,\n" " cl_kernel **cornerClKernel,\n" " unsigned int *workGroupNumRows,\n" " unsigned int *workGroupNumCols,\n" " unsigned int *microTileNumRows,\n" " unsigned int *microTileNumCols\n" ");\n\n" ) self.logic = "#include \"" + Common.getRelativeIncludePath() + "AutoGemmKernelSelectionSpecific.h\"\n" self.precisionInitialized = False self.orderInitialized = False self.transInitialized = False self.betaInitialized = False def newPrecision(self, precision ): #print("KernelSelectionSpecific: " + precision + "gemm") if self.precisionInitialized: self.logic += self.zeroIndent+self.tab+self.tab + "}\n" # 2 tabs self.logic += self.zeroIndent+self.tab + "}\n" # 1 tab self.logic += self.zeroIndent+"}\n" self.logic += self.zeroIndent + "return false; // didn't find a match\n" self.logic += "}\n\n" else: self.logic += self.zeroIndent self.logic += ( "\n// " + precision + "gemm kernel selection specific\n" "template<>\n" "bool gemmSelectKernelSpecific<" ) if precision == "s": self.logic += "float" elif precision == "d": self.logic += "double" elif precision == "c": self.logic += "FloatComplex" else: self.logic += "DoubleComplex" self.logic += ( ">(\n" " clblasOrder order,\n" " clblasTranspose transA,\n" " clblasTranspose transB,\n" " bool betaNonZero,\n" " unsigned int macroTileNumRows,\n" " unsigned int macroTileNumCols,\n" " unsigned int unroll,\n" " const char **tileKernelSource,\n" " const char **rowKernelSource,\n" " const char **colKernelSource,\n" " const char **cornerKernelSource,\n" " const char **sourceBuildOptions,\n" " const unsigned char **tileKernelBinary,\n" " const unsigned char **rowKernelBinary,\n" " const unsigned char **colKernelBinary,\n" " const unsigned char **cornerKernelBinary,\n" " size_t **tileKernelBinarySize,\n" " size_t **rowKernelBinarySize,\n" " size_t **colKernelBinarySize,\n" " size_t **cornerKernelBinarySize,\n" " const char **binaryBuildOptions,\n" " cl_kernel **tileClKernel,\n" " cl_kernel **rowClKernel,\n" " cl_kernel **colClKernel,\n" " cl_kernel **cornerClKernel,\n" " unsigned int *workGroupNumRows,\n" " unsigned int *workGroupNumCols,\n" " unsigned int *microTileNumRows,\n" " unsigned int *microTileNumCols\n" ") {\n" ) self.precisionInitialized = True self.orderInitialized = False self.transInitialized = False self.betaInitialized = False #################################### # KSL - new order def newOrder(self, order): if (self.orderInitialized): self.logic += self.zeroIndent+self.tab+self.tab + "}\n" # 2 tabs self.logic += self.zeroIndent+self.tab + "}\n" # 1 tab self.logic += self.zeroIndent self.logic += "} else " else: self.logic += self.zeroIndent self.logic += "if (order == " + order + ") {\n" self.orderInitialized = True self.transInitialized = False self.betaInitialized = False #################################### # KSL - new trans def newTrans(self, transA, transB): if (self.transInitialized): self.logic += self.zeroIndent+self.tab+self.tab + "}\n" # 2 tabs self.logic += self.zeroIndent+self.tab # 1 tab self.logic += "} else " else: self.logic += self.zeroIndent+self.tab # 1 tabs self.logic += "if (transA == " if transA == "N": self.logic += "clblasNoTrans" elif transA == "T": self.logic += "clblasTrans" else: self.logic += "clblasConjTrans" self.logic += " && transB == " if transB == "N": self.logic += "clblasNoTrans" elif transB == "T": self.logic += "clblasTrans" else: self.logic += "clblasConjTrans" 
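    # transA/transB use AutoGemm's single-character encoding seen throughout these
    # generators: "N" = no transpose, "T" = transpose, "C" = conjugate transpose.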
self.logic += ") {\n" self.transInitialized = True self.betaInitialized = False #################################### # KSL - new beta def newBeta(self, beta): if (self.betaInitialized): self.logic += self.zeroIndent+self.tab+self.tab # 2 tabs self.logic += "} else " else: self.logic += self.zeroIndent+self.tab+self.tab # 2 tabs self.logic += "if ( " if beta == 0: self.logic += "!betaNonZero" else: self.logic += "betaNonZero" self.logic += " ) {\n" self.betaInitialized = True ############################################################################## # KSL - add new kernel ############################################################################## def newKernel(self, kernel): # new kernel self.logic += self.zeroIndent+self.tab+self.tab+self.tab # 3 tabs self.logic += ("if ( macroTileNumRows == %u && macroTileNumCols == %u " "&& unroll == %u) {\n") \ % ( kernel.macroTileNumRows, kernel.macroTileNumCols, kernel.unroll ) #self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab+self.tab # 5 tabs #self.logic += "printf(\"selected kernel: " + kernel.getName() + "\\n\");\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*tileKernelSource = " + kernel.getName() + "_src;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*rowKernelSource = " + kernel.getRowName() + "_src;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*colKernelSource = " + kernel.getColName() + "_src;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*cornerKernelSource = " + kernel.getCornerName() + "_src;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*sourceBuildOptions = " + kernel.getName() + "_srcBuildOptions;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*tileKernelBinary = " + kernel.getName() + "_bin;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*rowKernelBinary = " + kernel.getRowName() + "_bin;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*colKernelBinary = " + kernel.getColName() + "_bin;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*cornerKernelBinary = " + kernel.getCornerName() + "_bin;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*tileKernelBinarySize = &" + kernel.getName() + "_binSize;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*rowKernelBinarySize = &" + kernel.getRowName() + "_binSize;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*colKernelBinarySize = &" + kernel.getColName() + "_binSize;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*cornerKernelBinarySize = &" + kernel.getCornerName() + "_binSize;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*binaryBuildOptions = " + kernel.getName() + "_binBuildOptions;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*tileClKernel = &" + kernel.getName() + "_clKernel;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*rowClKernel = &" + kernel.getRowName() + "_clKernel;\n" self.logic += 
self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*colClKernel = &" + kernel.getColName() + "_clKernel;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*cornerClKernel = &" + kernel.getCornerName() + "_clKernel;\n" # dims self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*workGroupNumRows = " + kernel.getName() + "_workGroupNumRows;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*workGroupNumCols = " + kernel.getName() + "_workGroupNumCols;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*microTileNumRows = " + kernel.getName() + "_microTileNumRows;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "*microTileNumCols = " + kernel.getName() + "_microTileNumCols;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab+self.tab # 4 tabs self.logic += "return true;\n" self.logic += self.zeroIndent+self.tab+self.tab+self.tab # 3 tabs self.logic += "}\n" self.selectionFile.write( self.logic ) self.logic = "" ############################################################################## # KSL - write to file ############################################################################## def writeToFile(self): self.logic += self.zeroIndent+self.tab+self.tab + "}\n" # 2 tabs self.logic += self.zeroIndent+self.tab + "}\n" # 1 tab self.logic += self.zeroIndent + "}\n" # 0 tab self.logic += self.zeroIndent + "return false; // didn't find a match\n" self.logic += "}\n" # close function self.selectionFile.write(self.logic) self.selectionFile.write("\n") self.selectionFile.close() self.incFile.write(self.inc) self.incFile.write("\n") self.incFile.close() ################################################################################ # Main ################################################################################ def writeKernelSelection(): print("AutoGemm.py: Generating kernel selection.") if not os.path.exists( Common.getIncludePath() ): os.makedirs( Common.getIncludePath() ) ######################################## # kernel selection specific kss = KernelSelectionSpecific() # for each precision kernel = KernelParameters.KernelParameters() for precision in AutoGemmParameters.precisions: kernel.precision = precision kss.newPrecision(precision) # valid tiles for this precision tiles = AutoGemmParameters.getTilesForPrecision(precision) # for non tile parameters for order in AutoGemmParameters.orders: kernel.order = order kss.newOrder(order) for transA in AutoGemmParameters.transposes[precision]: kernel.transA = transA for transB in AutoGemmParameters.transposes[precision]: kernel.transB = transB kss.newTrans(transA, transB) for beta in AutoGemmParameters.betas: kernel.beta = beta kss.newBeta(beta) # for tile parameters for tile in tiles: kernel.useTile(tile) kss.newKernel(kernel) kss.writeToFile() ######################################## # kernel selection ks = KernelSelection( \ AutoGemmParameters.precisions, \ AutoGemmParameters.orders, \ AutoGemmParameters.transposes, \ AutoGemmParameters.betas, \ AutoGemmParameters.unrolls, \ AutoGemmParameters.kernelSelectionData ) ks.writeToFile() ################################################################################ # Main ################################################################################ if __name__ == "__main__": if len(sys.argv) == 3: Common.setOutputPath(sys.argv[1]) 
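    # argv[2] selects the target architecture (argv[1], consumed above, is the
    # output path); see the USAGE message printed in the else branch below.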
AutoGemmParameters.setArchitecture(sys.argv[2]) writeKernelSelection() else: print("USAGE: python KernelSelection.py output_path architecture") clblas-2.10/src/library/blas/AutoGemm/KernelsToPreCompile.py000066400000000000000000000065041264277366700240570ustar00rootroot00000000000000import os import argparse import AutoGemmParameters import Common ################################################################################ # Auto-Gemm ################################################################################ def writeOfflineCompilation(args): print("AutoGemm.py: Generating list of kernels to pre-compile.") if not os.path.exists( Common.getIncludePath() ): os.makedirs( Common.getIncludePath() ) ocFileName = Common.getIncludePath() + "AutoGemmKernelsToPreCompile.h" ocFile = open(ocFileName, "w") ocFile.write( Common.getAutoGemmHeader() ) fileStr = "\n/*precision, order, transA, transB, beta, tileNumRows, tileNumCols, unroll*/\n" fileStr += "\nunsigned int gemmPreCompile[][8] = {\n" count = 0 for precision in args.precisions: ocFile.write( fileStr ) fileStr = "" validTiles = AutoGemmParameters.getTilesForPrecision(precision) for order in args.orders: for transpose in args.transposes: transA = transpose[0] transB = transpose[1] if (transA=="C" or transB=="C") and (precision=="s" or precision=="d"): # real precision doesn't have conjugate transpose continue for beta in args.betas: for tile in validTiles: # print combination kernelStr = " { %1u, %1u, %1u, %1u, %1u, %3u, %3u, %2u },\n" \ % ( Common.precisionInt[precision], Common.orderInt[order], Common.transposeInt[transA], Common.transposeInt[transB], beta, tile.macroTileNumRows, tile.macroTileNumCols, tile.unroll ) fileStr += kernelStr #print kernelStr count+=1 if count is 0: fileStr += " { %1u, %1u, %1u, %1u, %1u, %3u, %3u, %2u },\n" \ % ( 0, 0, 0, 0, 0, 0, 0, 0 ) fileStr += "};\n" fileStr += "unsigned int gemmPreCompileNum = " + str(count) + ";\n" ocFile.write( fileStr ) ocFile.close() count *= 4 print("AutoGemm.py: %u kernels will be pre-compiled." 
% count) ################################################################################ # Main ################################################################################ if __name__ == "__main__": # parse arguments ap = argparse.ArgumentParser(description="Which gemm kernels to compile offline.") ap.add_argument("--output-path", dest="output" ) ap.add_argument("--precisions", dest="precisions", action="store", nargs="+", choices=AutoGemmParameters.precisions ) ap.add_argument("--orders", dest="orders", action="store", nargs="+", choices=AutoGemmParameters.orders ) ap.add_argument("--transposes", dest="transposes", action="store", nargs="+", choices=AutoGemmParameters.getTransposeChoices() ) ap.add_argument("--betas", dest="betas", action="store", nargs="+", type=int, choices=AutoGemmParameters.betas ) args = ap.parse_args() if args.output: Common.setOutputPath(args.output) else: print("Warning: No output path specified; default is working directory.") # write offline compilation header if args.precisions is None: args.precisions = [] if args.transposes is None: args.transposes = [] if args.orders is None: args.orders = [] if args.betas is None: args.betas = [] writeOfflineCompilation(args) clblas-2.10/src/library/blas/AutoGemm/README.txt000066400000000000000000000000001264277366700212760ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/000077500000000000000000000000001264277366700240435ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.cc000066400000000000000000000042531264277366700277050ustar00rootroot00000000000000// GENERATED using create_user_gemm_cl_kernels.py #if defined( __APPLE__ ) || defined( __MACOSX ) #include #else #include #endif cl_kernel sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel = NULL; cl_kernel sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel = NULL; cl_kernel sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel = NULL; cl_kernel sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel = NULL; cl_kernel sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; cl_kernel sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; cl_kernel sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; #ifdef __cplusplus extern "C" { #endif void initUserGemmClKernels(void); #ifdef __cplusplus } #endif void initUserGemmClKernels(void) { if(sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel != NULL) { clReleaseKernel(sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel); sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel = NULL; } if(sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel != NULL) { clReleaseKernel(sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel); sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel = NULL; } if(sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel != NULL) { clReleaseKernel(sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel); sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel = NULL; } if(sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel != NULL) { clReleaseKernel(sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel); sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel = NULL; } if(sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel != NULL) { clReleaseKernel(sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel); sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; } if(sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel != NULL) { clReleaseKernel(sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel); sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; } if(sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel != NULL) { 
clReleaseKernel(sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel); sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel = NULL; } } clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/UserGemmClKernels.h000066400000000000000000000014161264277366700275450ustar00rootroot00000000000000 #ifndef USERGEMM_CL_KERNELS_H #define USERGEMM_CL_KERNELS_H #if defined( __APPLE__ ) || defined( __MACOSX ) #include #else #include #endif extern cl_kernel sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel; extern cl_kernel sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel; extern cl_kernel sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel; extern cl_kernel sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel; extern cl_kernel sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel; extern cl_kernel sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel; extern cl_kernel sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel; static const int user_kernel_count = 7; #ifdef __cplusplus extern "C" { #endif void initUserGemmClKernels(void); #ifdef __cplusplus } #endif #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.cpp000066400000000000000000000060621264277366700323100ustar00rootroot00000000000000/******************************************************************************* * This file is NOT auto-generated; populate it with hand-written kernels * - David Tanner ******************************************************************************/ #ifndef USER_GEMM_SOURCE_INCLUDES_CPP #define USER_GEMM_SOURCE_INCLUDES_CPP //**** Kernels to replace auto-generated versions #include "UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp" #include "UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp" #include "UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp" #include "UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp" #include "UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp" #include "UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp" #include "UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp" //**** Special kernels without auto-generated counterparts //**** micro tile size 8x8 kernel #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp" //**** mod32 but not mod64 
kernels #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp" //**** branch kernels with 32x32 macro tile size #include "UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp" #include "UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp" #include "UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp" //**** compiler flags //**** online compilation flags //const char * const User_srcBuildOptions = "-cl-std=CL2.0"; //const char * const User_binBuildOptions = " "; #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/UserGemmKernelSourceIncludes.h000066400000000000000000000110311264277366700317450ustar00rootroot00000000000000 #ifndef USER_GEMM_SOURCE_INCLUDES_H #define USER_GEMM_SOURCE_INCLUDES_H #include //#ifdef AUTOGEMM_USE_PRE_COMPILED_KERNELS //#include "AutoGemmKernelBinaries/sgemm_Col_NT_B1_MX128_NX128_KX16_bin.cpp" //#endif //**** compiler flags //**** online compilation flags const char * const User_srcBuildOptions = "-cl-std=CL" OPENCL_VERSION; const char * const User_binBuildOptions = "-cl-std=CL" OPENCL_VERSION; extern const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_unroll; extern const char * const sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src; extern unsigned char *sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin; extern size_t sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize; extern const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumRows; extern const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumCols; extern const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumRows; extern const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumCols; extern const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_unroll; extern const char * const sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src; extern unsigned char *sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin; extern size_t sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_unroll; extern const char * const sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src; extern unsigned char *sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin; extern size_t sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize; extern const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumRows; extern const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumCols; extern const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumRows; extern const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumCols; extern const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_unroll; extern const char * const sgemm_Col_NT_B1_MX128_NX128_KX16_src; extern unsigned char 
*sgemm_Col_NT_B1_MX128_NX128_KX16_bin; extern size_t sgemm_Col_NT_B1_MX128_NX128_KX16_binSize; extern const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows; extern const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols; extern const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows; extern const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols; extern const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_unroll; extern const char * const sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src; extern unsigned char *sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin; extern size_t sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumRows; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumCols; extern const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_unroll; extern const char * const sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src; extern unsigned char *sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin; extern size_t sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize; extern const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows; extern const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols; extern const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows; extern const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols; extern const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_unroll; extern const char * const sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src; extern unsigned char *sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin; extern size_t sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize; #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/create_user_gemm_cl_kernels.py000066400000000000000000000024241264277366700321260ustar00rootroot00000000000000""" Run from same directory as this file is in Generates UserGemmClKernels.cc Would be nice to use Jinja2 for this, but using print for now, for consistency """ kernelNames = [] # lets just read the kernel names from UserGemmClKernels.h: ifile = open('UserGemmClKernels.h', 'r') contents = ifile.read() for line in contents.split('\n'): if line.find('cl_kernel') < 0: continue kernelName = line.split()[2].split(';')[0] # probably not terribly un-fragile, but works for now kernelNames.append(kernelName) ifile.close() ofile = open('UserGemmClKernels.cc', 'w') ofile.write('// GENERATED using create_user_gemm_cl_kernels.py\n') ofile.write('\n') ofile.write('#if defined( __APPLE__ ) || defined( __MACOSX )\n') ofile.write('#include \n') ofile.write('#else\n') ofile.write('#include \n') ofile.write('#endif\n') ofile.write('\n') for kernelName in kernelNames: ofile.write('cl_kernel %s = NULL;\n' % kernelName) ofile.write('\n') ofile.write('void initUserGemmClKernels(void) {\n') for kernelName in kernelNames: ofile.write(' if(%s != NULL) {\n' % kernelName) ofile.write(' clReleaseKernel(%s);\n' % kernelName) ofile.write(' %s = NULL;\n' % kernelName) ofile.write(' }\n') ofile.write('}\n') ofile.close() 
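/* Illustrative sketch only -- not part of clBLAS. It shows how one of the
 * hand-tuned kernel source strings declared above (for example
 * sgemm_Col_NN_B1_MX032_NX032_KX16_src) together with the
 * User_srcBuildOptions string from UserGemmKernelSourceIncludes.h would
 * typically be compiled at run time with the standard OpenCL host API.
 * The helper name buildUserGemmKernel, its arguments, and the error handling
 * are hypothetical; clBLAS has its own internal kernel loader and cache. */
#if defined( __APPLE__ ) || defined( __MACOSX )
#include <OpenCL/cl.h>
#else
#include <CL/cl.h>
#endif
#include <stdio.h>
#include <stdlib.h>

/* Compile one NUL-terminated kernel source string and return the named __kernel,
 * or NULL on failure. */
static cl_kernel buildUserGemmKernel(cl_context ctx,
                                     cl_device_id dev,
                                     const char *src,          /* e.g. sgemm_Col_NN_B1_MX032_NX032_KX16_src */
                                     const char *kernelName,   /* e.g. "sgemm_Col_NN_B1_MX032_NX032_KX16"   */
                                     const char *buildOptions) /* e.g. User_srcBuildOptions                 */
{
    cl_int err;

    /* The source strings in this directory are NUL-terminated C strings,
     * so the lengths argument may be NULL. */
    cl_program prog = clCreateProgramWithSource(ctx, 1, &src, NULL, &err);
    if (err != CL_SUCCESS)
        return NULL;

    err = clBuildProgram(prog, 1, &dev, buildOptions, NULL, NULL);
    if (err != CL_SUCCESS) {
        /* Dump the build log; useful when a hand-tuned kernel fails to compile. */
        size_t logSize = 0;
        clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        char *log = (char *)malloc(logSize + 1);
        if (log) {
            clGetProgramBuildInfo(prog, dev, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
            log[logSize] = '\0';
            fprintf(stderr, "build failed for %s:\n%s\n", kernelName, log);
            free(log);
        }
        clReleaseProgram(prog);
        return NULL;
    }

    /* kernelName must match the __kernel function defined inside the source string. */
    cl_kernel kern = clCreateKernel(prog, kernelName, &err);
    clReleaseProgram(prog); /* safe: the kernel object keeps the program alive */
    return (err == CL_SUCCESS) ? kern : NULL;
}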
clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B0_MX048_NX048_KX08_src.cpp000066400000000000000000000145411264277366700325000ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B0_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_NN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NN_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NN_B0_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n #define M6x6 \ rA[0] = lA[offA + 0];\ rA[1] = lA[offA + 8];\ rA[2] = lA[offA + 16];\ rA[3] = lA[offA + 24];\ rA[4] = lA[offA + 32];\ rA[5] = lA[offA + 40];\ rB[0] = lB[offB + 0];\ rB[1] = lB[offB + 8];\ rB[2] = lB[offB + 16];\ rB[3] = lB[offB + 24]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 40]; \ offA += 49; \ offB += 49; \ rC[0][0]=mad(rA[0],rB[0],rC[0][0]); \ rC[1][0]=mad(rA[1],rB[0],rC[1][0]); \ rC[2][0]=mad(rA[2],rB[0],rC[2][0]); \ rC[3][0]=mad(rA[3],rB[0],rC[3][0]); \ rC[4][0]=mad(rA[4],rB[0],rC[4][0]); \ rC[5][0]=mad(rA[5],rB[0],rC[5][0]); \ rC[0][1]=mad(rA[0],rB[1],rC[0][1]); \ rC[1][1]=mad(rA[1],rB[1],rC[1][1]); \ rC[2][1]=mad(rA[2],rB[1],rC[2][1]); \ rC[3][1]=mad(rA[3],rB[1],rC[3][1]); \ rC[4][1]=mad(rA[4],rB[1],rC[4][1]); \ rC[5][1]=mad(rA[5],rB[1],rC[5][1]); \ rC[0][2]=mad(rA[0],rB[2],rC[0][2]); \ rC[1][2]=mad(rA[1],rB[2],rC[1][2]); \ rC[2][2]=mad(rA[2],rB[2],rC[2][2]); \ rC[3][2]=mad(rA[3],rB[2],rC[3][2]); \ rC[4][2]=mad(rA[4],rB[2],rC[4][2]); \ rC[5][2]=mad(rA[5],rB[2],rC[5][2]); \ rC[0][3]=mad(rA[0],rB[3],rC[0][3]); \ rC[1][3]=mad(rA[1],rB[3],rC[1][3]); \ rC[2][3]=mad(rA[2],rB[3],rC[2][3]); \ rC[3][3]=mad(rA[3],rB[3],rC[3][3]); \ rC[4][3]=mad(rA[4],rB[3],rC[4][3]); \ rC[5][3]=mad(rA[5],rB[3],rC[5][3]); \ rC[0][4]=mad(rA[0],rB[4],rC[0][4]); \ rC[1][4]=mad(rA[1],rB[4],rC[1][4]); \ rC[2][4]=mad(rA[2],rB[4],rC[2][4]); \ rC[3][4]=mad(rA[3],rB[4],rC[3][4]); \ rC[4][4]=mad(rA[4],rB[4],rC[4][4]); \ rC[5][4]=mad(rA[5],rB[4],rC[5][4]); \ rC[0][5]=mad(rA[0],rB[5],rC[0][5]); \ rC[1][5]=mad(rA[1],rB[5],rC[1][5]); \ rC[2][5]=mad(rA[2],rB[5],rC[2][5]); \ rC[3][5]=mad(rA[3],rB[5],rC[3][5]); \ rC[4][5]=mad(rA[4],rB[5],rC[4][5]); \ rC[5][5]=mad(rA[5],rB[5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_Col_NN_B0_MX048_NX048_KX08 ( __global double const * restrict A, __global double const * restrict B, __global double * C, double const alpha, double const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = { {(double)0} }; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += 
gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2] ; C[24*ldc] = alpha*rC[0][3] ; C[32*ldc] = alpha*rC[0][4] ; C[40*ldc] = alpha*rC[0][5] ; C+=8; ; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2] ; C[24*ldc] = alpha*rC[1][3] ; C[32*ldc] = alpha*rC[1][4] ; C[40*ldc] = alpha*rC[1][5] ; C+=8; ; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2] ; C[24*ldc] = alpha*rC[2][3] ; C[32*ldc] = alpha*rC[2][4] ; C[40*ldc] = alpha*rC[2][5] ; C+=8; ; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2] ; C[24*ldc] = alpha*rC[3][3] ; C[32*ldc] = alpha*rC[3][4] ; C[40*ldc] = alpha*rC[3][5] ; C+=8; ; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2] ; C[24*ldc] = alpha*rC[4][3] ; C[32*ldc] = alpha*rC[4][4] ; C[40*ldc] = alpha*rC[4][5] ; C+=8; ; C[0*ldc] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2] ; C[24*ldc] = alpha*rC[5][3] ; C[32*ldc] = alpha*rC[5][4] ; C[40*ldc] = alpha*rC[5][5] ; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NN_B1_MX048_NX048_KX08_src.cpp000066400000000000000000000162521264277366700325020ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NN_B1_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_NN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NN_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NN_B1_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 8]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 24]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 40]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 8]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 24]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 40]; \ offA += 49; \ offB += 49; \ rC[0][0]=mad(rA[0],rB[0],rC[0][0]); \ rC[1][0]=mad(rA[1],rB[0],rC[1][0]); \ rC[2][0]=mad(rA[2],rB[0],rC[2][0]); \ rC[3][0]=mad(rA[3],rB[0],rC[3][0]); \ rC[4][0]=mad(rA[4],rB[0],rC[4][0]); \ rC[5][0]=mad(rA[5],rB[0],rC[5][0]); \ rC[0][1]=mad(rA[0],rB[1],rC[0][1]); \ rC[1][1]=mad(rA[1],rB[1],rC[1][1]); \ rC[2][1]=mad(rA[2],rB[1],rC[2][1]); \ 
rC[3][1]=mad(rA[3],rB[1],rC[3][1]); \ rC[4][1]=mad(rA[4],rB[1],rC[4][1]); \ rC[5][1]=mad(rA[5],rB[1],rC[5][1]); \ rC[0][2]=mad(rA[0],rB[2],rC[0][2]); \ rC[1][2]=mad(rA[1],rB[2],rC[1][2]); \ rC[2][2]=mad(rA[2],rB[2],rC[2][2]); \ rC[3][2]=mad(rA[3],rB[2],rC[3][2]); \ rC[4][2]=mad(rA[4],rB[2],rC[4][2]); \ rC[5][2]=mad(rA[5],rB[2],rC[5][2]); \ rC[0][3]=mad(rA[0],rB[3],rC[0][3]); \ rC[1][3]=mad(rA[1],rB[3],rC[1][3]); \ rC[2][3]=mad(rA[2],rB[3],rC[2][3]); \ rC[3][3]=mad(rA[3],rB[3],rC[3][3]); \ rC[4][3]=mad(rA[4],rB[3],rC[4][3]); \ rC[5][3]=mad(rA[5],rB[3],rC[5][3]); \ rC[0][4]=mad(rA[0],rB[4],rC[0][4]); \ rC[1][4]=mad(rA[1],rB[4],rC[1][4]); \ rC[2][4]=mad(rA[2],rB[4],rC[2][4]); \ rC[3][4]=mad(rA[3],rB[4],rC[3][4]); \ rC[4][4]=mad(rA[4],rB[4],rC[4][4]); \ rC[5][4]=mad(rA[5],rB[4],rC[5][4]); \ rC[0][5]=mad(rA[0],rB[5],rC[0][5]); \ rC[1][5]=mad(rA[1],rB[5],rC[1][5]); \ rC[2][5]=mad(rA[2],rB[5],rC[2][5]); \ rC[3][5]=mad(rA[3],rB[5],rC[3][5]); \ rC[4][5]=mad(rA[4],rB[5],rC[4][5]); \ rC[5][5]=mad(rA[5],rB[5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_Col_NN_B1_MX048_NX048_KX08 ( __global double const * restrict A, __global double const * restrict B, __global double * C, double const alpha, double const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = { {(double)0} }; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = 
alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B0_MX048_NX048_KX08_src.cpp000066400000000000000000000205311264277366700325020ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B0_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_NT_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NT_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NT_B0_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n \n \ntypedef union _GPtr { \n __global float *f; \n __global double *d; \n __global float2 *f2v; \n __global double2 *d2v; \n} GPtr; \n #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 1]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 17]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 33]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 1]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 17]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 33]; \ offA += 48; \ offB += 48; \ rC[0][0] = mad(rA[0],rB[0],rC[0][0]); \ rC[0][1] = mad(rA[1],rB[0],rC[0][1]); \ rC[0][2] = mad(rA[2],rB[0],rC[0][2]); \ rC[0][3] = mad(rA[3],rB[0],rC[0][3]); \ rC[0][4] = mad(rA[4],rB[0],rC[0][4]); \ rC[0][5] = mad(rA[5],rB[0],rC[0][5]); \ rC[1][0] = mad(rA[0],rB[1],rC[1][0]); \ rC[1][1] = mad(rA[1],rB[1],rC[1][1]); \ rC[1][2] = mad(rA[2],rB[1],rC[1][2]); \ rC[1][3] = mad(rA[3],rB[1],rC[1][3]); \ rC[1][4] = mad(rA[4],rB[1],rC[1][4]); \ rC[1][5] = mad(rA[5],rB[1],rC[1][5]); \ rC[2][0] = mad(rA[0],rB[2],rC[2][0]); \ rC[2][1] = mad(rA[1],rB[2],rC[2][1]); \ rC[2][2] = mad(rA[2],rB[2],rC[2][2]); \ rC[2][3] = mad(rA[3],rB[2],rC[2][3]); \ rC[2][4] = mad(rA[4],rB[2],rC[2][4]); \ rC[2][5] = mad(rA[5],rB[2],rC[2][5]); \ rC[3][0] = mad(rA[0],rB[3],rC[3][0]); \ rC[3][1] = mad(rA[1],rB[3],rC[3][1]); \ rC[3][2] = mad(rA[2],rB[3],rC[3][2]); \ rC[3][3] = mad(rA[3],rB[3],rC[3][3]); \ rC[3][4] = mad(rA[4],rB[3],rC[3][4]); \ rC[3][5] = mad(rA[5],rB[3],rC[3][5]); \ rC[4][0] = mad(rA[0],rB[4],rC[4][0]); \ rC[4][1] = mad(rA[1],rB[4],rC[4][1]); \ rC[4][2] = mad(rA[2],rB[4],rC[4][2]); \ rC[4][3] = mad(rA[3],rB[4],rC[4][3]); \ rC[4][4] = mad(rA[4],rB[4],rC[4][4]); \ rC[4][5] = mad(rA[5],rB[4],rC[4][5]); \ rC[5][0] = mad(rA[0],rB[5],rC[5][0]); \ rC[5][1] = mad(rA[1],rB[5],rC[5][1]); \ rC[5][2] = mad(rA[2],rB[5],rC[5][2]); \ rC[5][3] = mad(rA[3],rB[5],rC[5][3]); \ rC[5][4] = mad(rA[4],rB[5],rC[5][4]); \ rC[5][5] = mad(rA[5],rB[5],rC[5][5]); \ 
mem_fence(CLK_LOCAL_MEM_FENCE);\n \n \n \n__attribute__((reqd_work_group_size(8,8,1))) \n__kernel void dgemm_Col_NT_B0_MX048_NX048_KX08 ( \n __global double2 const * restrict A, \n __global double2 const * restrict B, \n __global double * C, \n double const alpha, \n double const beta, \n uint const M, \n uint const N, \n uint const K, \n uint lda, \n uint ldb, \n uint ldc, \n uint offsetA, \n uint offsetB, \n uint offsetC) \n{ \n GPtr uA; \n GPtr uB; \n uA.d2v = (__global double2 *)A; \n uB.d2v = (__global double2 *)B; \n \n \n uA.d += offsetA; \n uB.d += offsetB; \n C += offsetC; \n \n \n double rC[6][6] = { {(double)0} }; \n double rA[6]; \n double rB[6]; \n \n __local double lA[392]; \n __local double lB[392]; \n \n int gidx = get_group_id(0); \n int gidy = get_group_id(1); \n int idx = get_local_id(0); \n int idy = get_local_id(1); \n \n \n uA.d += 2*(gidx*24 + idx) + idy*lda; \n uB.d += 2*(gidy*24 + idx) + idy*ldb; \n \n int block_k = K >> 3; \n do { \n __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); \n __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); \n barrier(CLK_LOCAL_MEM_FENCE); \n plB[0 ] = uB.d2v[0 ]; \n plB[8 ] = uB.d2v[8 ]; \n plB[16] = uB.d2v[16]; \n plA[0 ] = uA.d2v[0 ]; \n plA[8 ] = uA.d2v[8 ]; \n plA[16] = uA.d2v[16]; \n \n barrier(CLK_LOCAL_MEM_FENCE); \n int offA = idx << 1; \n int offB = idy << 1; \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n uA.d += lda << 3; \n uB.d += ldb << 3; \n } while (--block_k > 0); \n \n \n \n int offset_x = gidx*48+ idx*2; \n int offset_y = gidy*48+ idy*2; \n (C[(offset_x + 0) + (offset_y + 0) * ldc] = alpha * rC[0][0]); \n (C[(offset_x + 1) + (offset_y + 0) * ldc] = alpha * rC[0][1]); \n (C[(offset_x + 0) + (offset_y + 1) * ldc] = alpha * rC[1][0]); \n (C[(offset_x + 1) + (offset_y + 1) * ldc] = alpha * rC[1][1]); \n (C[(offset_x + 0) + (offset_y + 16) * ldc] = alpha * rC[2][0]); \n (C[(offset_x + 1) + (offset_y + 16) * ldc] = alpha * rC[2][1]); \n (C[(offset_x + 0) + (offset_y + 17) * ldc] = alpha * rC[3][0]); \n (C[(offset_x + 1) + (offset_y + 17) * ldc] = alpha * rC[3][1]); \n (C[(offset_x + 0) + (offset_y + 32) * ldc] = alpha * rC[4][0]); \n (C[(offset_x + 1) + (offset_y + 32) * ldc] = alpha * rC[4][1]); \n (C[(offset_x + 0) + (offset_y + 33) * ldc] = alpha * rC[5][0]); \n (C[(offset_x + 1) + (offset_y + 33) * ldc] = alpha * rC[5][1]); \n (C[(offset_x + 16) + (offset_y + 0) * ldc] = alpha * rC[0][2]); \n (C[(offset_x + 17) + (offset_y + 0) * ldc] = alpha * rC[0][3]); \n (C[(offset_x + 16) + (offset_y + 1) * ldc] = alpha * rC[1][2]); \n (C[(offset_x + 17) + (offset_y + 1) * ldc] = alpha * rC[1][3]); \n (C[(offset_x + 16) + (offset_y + 16) * ldc] = alpha * rC[2][2]); \n (C[(offset_x + 17) + (offset_y + 16) * ldc] = alpha * rC[2][3]); \n (C[(offset_x + 16) + (offset_y + 17) * ldc] = alpha * rC[3][2]); \n (C[(offset_x + 17) + (offset_y + 17) * ldc] = alpha * rC[3][3]); \n (C[(offset_x + 16) + (offset_y + 32) * ldc] = alpha * rC[4][2]); \n (C[(offset_x + 17) + (offset_y + 32) * ldc] = alpha * rC[4][3]); \n (C[(offset_x + 16) + (offset_y + 33) * ldc] = alpha * rC[5][2]); \n (C[(offset_x + 17) + (offset_y + 33) * ldc] = alpha * rC[5][3]); \n (C[(offset_x + 32) + (offset_y + 0) * ldc] = alpha * rC[0][4]); \n (C[(offset_x + 33) + (offset_y + 0) * ldc] = alpha * rC[0][5]); \n (C[(offset_x + 32) + (offset_y + 1) * ldc] = alpha * rC[1][4]); \n (C[(offset_x + 33) + (offset_y + 1) * ldc] = alpha * rC[1][5]); \n (C[(offset_x + 32) + (offset_y + 16) * ldc] = alpha * rC[2][4]); \n (C[(offset_x + 33) + 
(offset_y + 16) * ldc] = alpha * rC[2][5]); \n (C[(offset_x + 32) + (offset_y + 17) * ldc] = alpha * rC[3][4]); \n (C[(offset_x + 33) + (offset_y + 17) * ldc] = alpha * rC[3][5]); \n (C[(offset_x + 32) + (offset_y + 32) * ldc] = alpha * rC[4][4]); \n (C[(offset_x + 33) + (offset_y + 32) * ldc] = alpha * rC[4][5]); \n (C[(offset_x + 32) + (offset_y + 33) * ldc] = alpha * rC[5][4]); \n (C[(offset_x + 33) + (offset_y + 33) * ldc] = alpha * rC[5][5]); \n} ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_NT_B1_MX048_NX048_KX08_src.cpp000066400000000000000000000244201264277366700325040ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_NT_B1_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_NT_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_NT_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_NT_B1_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n \n \ntypedef union _GPtr { \n __global float *f; \n __global double *d; \n __global float2 *f2v; \n __global double2 *d2v; \n} GPtr; \n #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 1]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 17]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 33]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 1]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 17]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 33]; \ offA += 48; \ offB += 48; \ rC[0][0] = mad(rA[0],rB[0],rC[0][0]); \ rC[0][1] = mad(rA[1],rB[0],rC[0][1]); \ rC[0][2] = mad(rA[2],rB[0],rC[0][2]); \ rC[0][3] = mad(rA[3],rB[0],rC[0][3]); \ rC[0][4] = mad(rA[4],rB[0],rC[0][4]); \ rC[0][5] = mad(rA[5],rB[0],rC[0][5]); \ rC[1][0] = mad(rA[0],rB[1],rC[1][0]); \ rC[1][1] = mad(rA[1],rB[1],rC[1][1]); \ rC[1][2] = mad(rA[2],rB[1],rC[1][2]); \ rC[1][3] = mad(rA[3],rB[1],rC[1][3]); \ rC[1][4] = mad(rA[4],rB[1],rC[1][4]); \ rC[1][5] = mad(rA[5],rB[1],rC[1][5]); \ rC[2][0] = mad(rA[0],rB[2],rC[2][0]); \ rC[2][1] = mad(rA[1],rB[2],rC[2][1]); \ rC[2][2] = mad(rA[2],rB[2],rC[2][2]); \ rC[2][3] = mad(rA[3],rB[2],rC[2][3]); \ rC[2][4] = mad(rA[4],rB[2],rC[2][4]); \ rC[2][5] = mad(rA[5],rB[2],rC[2][5]); \ rC[3][0] = mad(rA[0],rB[3],rC[3][0]); \ rC[3][1] = mad(rA[1],rB[3],rC[3][1]); \ rC[3][2] = mad(rA[2],rB[3],rC[3][2]); \ rC[3][3] = mad(rA[3],rB[3],rC[3][3]); \ rC[3][4] = mad(rA[4],rB[3],rC[3][4]); \ rC[3][5] = mad(rA[5],rB[3],rC[3][5]); \ rC[4][0] = mad(rA[0],rB[4],rC[4][0]); \ rC[4][1] = mad(rA[1],rB[4],rC[4][1]); \ rC[4][2] = mad(rA[2],rB[4],rC[4][2]); \ rC[4][3] = mad(rA[3],rB[4],rC[4][3]); \ rC[4][4] = mad(rA[4],rB[4],rC[4][4]); \ rC[4][5] = mad(rA[5],rB[4],rC[4][5]); \ rC[5][0] = mad(rA[0],rB[5],rC[5][0]); \ rC[5][1] = mad(rA[1],rB[5],rC[5][1]); \ rC[5][2] = mad(rA[2],rB[5],rC[5][2]); \ rC[5][3] = mad(rA[3],rB[5],rC[5][3]); \ rC[5][4] = mad(rA[4],rB[5],rC[5][4]); \ rC[5][5] = mad(rA[5],rB[5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n \n \n 
\n__attribute__((reqd_work_group_size(8,8,1))) \n__kernel void dgemm_Col_NT_B1_MX048_NX048_KX08_src ( \n __global double2 const * restrict A, \n __global double2 const * restrict B, \n __global double * C, \n double const alpha, \n double const beta, \n uint const M, \n uint const N, \n uint const K, \n uint lda, \n uint ldb, \n uint ldc, \n uint offsetA, \n uint offsetB, \n uint offsetC) \n{ \n GPtr uA; \n GPtr uB; \n uA.d2v = (__global double2 *)A; \n uB.d2v = (__global double2 *)B; \n \n uA.d += offsetA; \n uB.d += offsetB; \n C += offsetC; \n \n \n double rC[6][6] = { {(double)0} }; \n double rA[6]; \n double rB[6]; \n \n __local double lA[392]; \n __local double lB[392]; \n \n int gidx = get_group_id(0); \n int gidy = get_group_id(1); \n int idx = get_local_id(0); \n int idy = get_local_id(1); \n \n \n uA.d += 2*(gidx*24 + idx) + idy*lda; \n uB.d += 2*(gidy*24 + idx) + idy*ldb; \n \n int block_k = K >> 3; \n do { \n __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); \n __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); \n barrier(CLK_LOCAL_MEM_FENCE); \n plB[0 ] = uB.d2v[0 ]; \n plB[8 ] = uB.d2v[8 ]; \n plB[16] = uB.d2v[16]; \n plA[0 ] = uA.d2v[0 ]; \n plA[8 ] = uA.d2v[8 ]; \n plA[16] = uA.d2v[16]; \n \n barrier(CLK_LOCAL_MEM_FENCE); \n int offA = idx << 1; \n int offB = idy << 1; \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n M6x6 \n uA.d += lda << 3; \n uB.d += ldb << 3; \n } while (--block_k > 0); \n \n int offset_x = gidx*48+ idx*2; \n int offset_y = gidy*48+ idy*2; \n (C[(offset_x + 0) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 0) * ldc], alpha * rC[0][0])); \n (C[(offset_x + 1) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 0) * ldc], alpha * rC[0][1])); \n (C[(offset_x + 0) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 1) * ldc], alpha * rC[1][0])); \n (C[(offset_x + 1) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 1) * ldc], alpha * rC[1][1])); \n (C[(offset_x + 0) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 16) * ldc], alpha * rC[2][0])); \n (C[(offset_x + 1) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 16) * ldc], alpha * rC[2][1])); \n (C[(offset_x + 0) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 17) * ldc], alpha * rC[3][0])); \n (C[(offset_x + 1) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 17) * ldc], alpha * rC[3][1])); \n (C[(offset_x + 0) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 32) * ldc], alpha * rC[4][0])); \n (C[(offset_x + 1) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 32) * ldc], alpha * rC[4][1])); \n (C[(offset_x + 0) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 33) * ldc], alpha * rC[5][0])); \n (C[(offset_x + 1) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 33) * ldc], alpha * rC[5][1])); \n (C[(offset_x + 16) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 0) * ldc], alpha * rC[0][2])); \n (C[(offset_x + 17) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 0) * ldc], alpha * rC[0][3])); \n (C[(offset_x + 16) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 1) * ldc], alpha * rC[1][2])); \n (C[(offset_x + 17) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 1) * ldc], alpha * rC[1][3])); \n (C[(offset_x + 16) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y 
+ 16) * ldc], alpha * rC[2][2])); \n (C[(offset_x + 17) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 16) * ldc], alpha * rC[2][3])); \n (C[(offset_x + 16) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 17) * ldc], alpha * rC[3][2])); \n (C[(offset_x + 17) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 17) * ldc], alpha * rC[3][3])); \n (C[(offset_x + 16) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 32) * ldc], alpha * rC[4][2])); \n (C[(offset_x + 17) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 32) * ldc], alpha * rC[4][3])); \n (C[(offset_x + 16) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 33) * ldc], alpha * rC[5][2])); \n (C[(offset_x + 17) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 33) * ldc], alpha * rC[5][3])); \n (C[(offset_x + 32) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 0) * ldc], alpha * rC[0][4])); \n (C[(offset_x + 33) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 0) * ldc], alpha * rC[0][5])); \n (C[(offset_x + 32) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 1) * ldc], alpha * rC[1][4])); \n (C[(offset_x + 33) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 1) * ldc], alpha * rC[1][5])); \n (C[(offset_x + 32) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 16) * ldc], alpha * rC[2][4])); \n (C[(offset_x + 33) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 16) * ldc], alpha * rC[2][5])); \n (C[(offset_x + 32) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 17) * ldc], alpha * rC[3][4])); \n (C[(offset_x + 33) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 17) * ldc], alpha * rC[3][5])); \n (C[(offset_x + 32) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 32) * ldc], alpha * rC[4][4])); \n (C[(offset_x + 33) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 32) * ldc], alpha * rC[4][5])); \n (C[(offset_x + 32) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 33) * ldc], alpha * rC[5][4])); \n (C[(offset_x + 33) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 33) * ldc], alpha * rC[5][5])); \n} ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B0_MX048_NX048_KX08_src.cpp000066400000000000000000000143061264277366700325050ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B0_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_TN_B0_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_TN_B0_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_TN_B0_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel 
void dgemm_Col_TN_B0_MX048_NX048_KX08_src ( __global double const * restrict A, __global double const * restrict B, __global double * C, double const alpha, double const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC ) { double rC[6][6] = { {(double)0} }; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 4; int idyT = idt / 4; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[196] = A[4]; plA[16] = A[16*lda]; plA[212] = A[4+16*lda]; plA[32] = A[32*lda]; plA[228] = A[4+32*lda]; plB[0] = B[0]; plB[196] = B[4+0*ldb]; plB[16] = B[0+16*ldb]; plB[212] = B[4+16*ldb]; plB[32] = B[0+32*ldb]; plB[228] = B[4+32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C[40*ldc] = alpha*rC[0][5]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C[40*ldc] = alpha*rC[1][5]; C+=8; C[0*ldc] = 
alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = alpha*rC[2][4]; C[40*ldc] = alpha*rC[2][5]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C[40*ldc] = alpha*rC[3][5]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; C[40*ldc] = alpha*rC[4][5]; C+=8; C[0*ldc] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2]; C[24*ldc] = alpha*rC[5][3]; C[32*ldc] = alpha*rC[5][4]; C[40*ldc] = alpha*rC[5][5]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/dgemm_Col_TN_B1_MX048_NX048_KX08_src.cpp000066400000000000000000000154331264277366700325100ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H #define KERNEL_DGEMM_COL_TN_B1_MX048_NX048_KX08_SRC_H #pragma message("AutoGemm's dgemm_Col_TN_B1_MX048_NX048_KX08_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_workGroupNumRows = 8; const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_workGroupNumCols = 8; const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_microTileNumRows = 6; const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_microTileNumCols = 6; const unsigned int dgemm_Col_TN_B1_MX048_NX048_KX08_unroll = 8; const char * const dgemm_Col_TN_B1_MX048_NX048_KX08_src = STRINGIFY( #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_Col_TN_B1_MX048_NX048_KX08_src ( __global double const * restrict A, __global double const * restrict B, __global double * C, double const alpha, double const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC ) { double rC[6][6] = { {(double)0} }; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 4; int idyT = idt / 4; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[196] = A[4]; plA[16] = A[16*lda]; plA[212] = A[4+16*lda]; plA[32] = A[32*lda]; plA[228] = A[4+32*lda]; plB[0] = B[0]; plB[196] = B[4+0*ldb]; plB[16] = B[0+16*ldb]; plB[212] = B[4+16*ldb]; plB[32] = B[0+32*ldb]; plB[228] = B[4+32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); 
rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; C+=8; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX032_NX032_KX16_src.cpp000066400000000000000000000056551264277366700325060ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef 
KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B0_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B0_MX032_NX032_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B0_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_NN_B0_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_NN_B0_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_NN_B0_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B0_MX032_NX032_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += (gidy*32+idyT)*ldb + idxT; uint block_k = K >> 4; do { __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idxT*33+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX064_NX064_KX16_src.cpp000066400000000000000000000077771264277366700325270ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B0_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B0_MX064_NX064_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B0_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_NN_B0_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_NN_B0_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_NN_B0_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 
32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B0_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*64+ idx + idy*lda; B += (gidy*64+idy)*ldb+ idx; uint block_k = K >> 4; do { __local float* plA = lA + idy*65+idx; __local float* plB = lB + idx*65+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C[32*ldc] = alpha*rC[0][2]; C[48*ldc] = alpha*rC[0][3]; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; C[32*ldc] = alpha*rC[1][2]; C[48*ldc] = alpha*rC[1][3]; C+=16; C[0*ldc] = alpha*rC[2][0] ; C[16*ldc] = alpha*rC[2][1]; C[32*ldc] = alpha*rC[2][2]; C[48*ldc] = alpha*rC[2][3]; C+=16; C[0*ldc] = alpha*rC[3][0] ; C[16*ldc] = alpha*rC[3][1]; C[32*ldc] = alpha*rC[3][2]; C[48*ldc] = alpha*rC[3][3]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B0_MX096_NX096_KX16_src.cpp000066400000000000000000000155461264277366700325320ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B0_MX096_NX096_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B0_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B0_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B0_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_NN_B0_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int 
sgemm_Col_NN_B0_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_NN_B0_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); \ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B0_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*96+ idx + idy*lda; B += gidy*96*ldb+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*97+idx; __local float* plB = lB + idx*97+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plB[64] = B[64*ldb]; plB[80] = B[80*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; plA[64] = A[64+0*lda]; plA[80] = A[80+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = 
alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src.cpp000066400000000000000000000070671264277366700334440ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_BRANCH_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows = 2; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols = 2; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_unroll = 16; #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin = 0; size_t sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize = 0; #endif const char * const sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float 
rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[528];//16*32+16 __local float lB[528]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx*32+ idx; int CurrentOffSetB = gidy*32+ idy; A += gidx*32+ idx + idy*lda; B += gidy*32*ldb+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*33+idx; __local float* plB = lB + idx*33+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += 16; //} } while (--block_k > 0); int offset_x = gidx*32+idx; int offset_y = gidy*32+ idy; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; int i = 0; do { C[0 ] = mad(alpha, rC[i][0], beta*C[0]); if(offset_y+16=M ) return; } while (++i < 2); } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX032_NX032_KX16_src.cpp000066400000000000000000000060661264277366700325040ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_NN_B1_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_NN_B1_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B1_MX032_NX032_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += (gidy*32+idyT)*ldb + idxT; uint block_k = K >> 4; do { __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idxT*33+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint 
offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX064_NX064_KX16_src.cpp000066400000000000000000000107601264277366700325120ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B1_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B1_MX064_NX064_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B1_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_NN_B1_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_NN_B1_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_NN_B1_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B1_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*64+ idx + idy*lda; B += (gidy*64+idy)*ldb+ idx; uint block_k = K >> 4; do { __local float* plA = lA + idy*65+idx; __local float* plB = lB + idx*65+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 
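/* the 16 consecutive M4x4 invocations unroll one KX16 depth block: each step reads a 4-element slice of A and of B from local memory, accumulates all 16 products into the 4x4 rC micro-tile, and advances offA/offB by one padded row (65 floats) */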
M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NN_B1_MX096_NX096_KX16_src.cpp000066400000000000000000000156641264277366700325340ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NN_B1_MX096_NX096_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NN_B1_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NN_B1_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NN_B1_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_NN_B1_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int sgemm_Col_NN_B1_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_NN_B1_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); \ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ 
rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NN_B1_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*96+ idx + idy*lda; B += gidy*96*ldb+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*97+idx; __local float* plB = lB + idx*97+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plB[64] = B[64*ldb]; plB[80] = B[80*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; plA[64] = A[64+0*lda]; plA[80] = A[80+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda<<4; B += 16; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif 
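/*******************************************************************************
 * Illustrative note, not part of the original library sources: each kernel in
 * this directory exports the same four tile constants (workGroupNumRows/Cols =
 * 16, microTileNumRows/Cols = 2, 4, 6 or 8), so one 16x16 work-group computes
 * a 32x32, 64x64, 96x96 or 128x128 macro-tile of C. The sketch below shows how
 * a caller could derive the NDRange from those constants; every identifier in
 * it (enqueueSgemmTile, queue, ...) is hypothetical, and the divisibility
 * assumption only matches the non-BRANCH kernels.
 ******************************************************************************/
#if 0 /* sketch only, never compiled */
#include <CL/cl.h>

static cl_int enqueueSgemmTile(cl_command_queue queue, cl_kernel kernel,
                               cl_uint M, cl_uint N,
                               size_t workGroupNumRows, size_t workGroupNumCols,
                               size_t microTileNumRows, size_t microTileNumCols)
{
    /* one work-item per micro-tile: assumes M (resp. N) is a multiple of
       workGroupNumRows*microTileNumRows (resp. Cols), e.g. 96 for the kernel above */
    size_t local[2]  = { workGroupNumRows, workGroupNumCols };
    size_t global[2] = { M / microTileNumRows, N / microTileNumCols };
    return clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
}
#endif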
clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX032_NX032_KX16_src.cpp000066400000000000000000000057531264277366700325130ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B0_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B0_MX032_NX032_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B0_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_NT_B0_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_NT_B0_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_NT_B0_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B0_MX032_NX032_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idyT*33+idxT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX064_NX064_KX16_src.cpp000066400000000000000000000105301264277366700325120ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B0_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B0_MX064_NX064_KX16_workGroupNumCols = 
16; const unsigned int sgemm_Col_NT_B0_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_NT_B0_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_NT_B0_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_NT_B0_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B0_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64+ idxT + idyT*lda; B += gidy*64+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idyT*65+idxT; __local float* plB = lB + idyT*65+idxT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plB[32] = B[32+0*ldb]; plB[48] = B[48+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C[32*ldc] = alpha*rC[0][2]; C[48*ldc] = alpha*rC[0][3]; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; C[32*ldc] = alpha*rC[1][2]; C[48*ldc] = alpha*rC[1][3]; C+=16; C[0*ldc] = alpha*rC[2][0] ; C[16*ldc] = alpha*rC[2][1]; C[32*ldc] = alpha*rC[2][2]; C[48*ldc] = alpha*rC[2][3]; C+=16; C[0*ldc] = alpha*rC[3][0] ; C[16*ldc] = alpha*rC[3][1]; C[32*ldc] = alpha*rC[3][2]; C[48*ldc] = alpha*rC[3][3]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B0_MX096_NX096_KX16_src.cpp000066400000000000000000000155671264277366700325430ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B0_MX096_NX096_KX16_SRC_H #pragma 
message("AutoGemm's sgemm_Col_NT_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B0_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B0_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B0_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_NT_B0_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int sgemm_Col_NT_B0_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_NT_B0_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); \ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B0_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*96+ idx + idy*lda; B += gidy*96+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*97+idx; __local float* plB = lB + idy*97+idx; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plB[32] = B[32+0*ldb]; plB[48] = B[48+0*ldb]; plB[64] = B[64+0*ldb]; plB[80] = B[80+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = 
A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; plA[64] = A[64+0*lda]; plA[80] = A[80+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src.cpp000066400000000000000000000070501264277366700334420ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_BRANCH_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src (if exists) overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumRows = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_microTileNumCols = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_unroll = 16; #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin = 0; size_t sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize = 0; #endif const char * const sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ 
rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[528];//16*32+16 __local float lB[528]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx*32+ idx; int CurrentOffSetB = gidy*32+ idx; A += gidx*32+ idx + idy*lda; B += gidy*32+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*33+idx; __local float* plB = lB + idy*33+idx; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += ldb<<4; } while (--block_k > 0); int offset_x = gidx*32+idx; int offset_y = gidy*32+ idy; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; int i = 0; do { C[0 ] = mad(alpha, rC[i][0], beta*C[0]); if(offset_y+16<N){ C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]);} C+=16; offset_x+=16; if(offset_x>=M ) return; } while (++i < 2); } ); #endif sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src.cpp000066400000000000000000000076701264277366700334760ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel * the kernels below work with an assumption: after the main part of the matrix has been computed by the 64x64 macro-tile kernels, the remaining boundary strips are of size 32. * Thus, M and N are multiples of 32 but not necessarily of 64.
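 * (For example: with M = N = 96, the 64x64 macro-tile kernels cover C[0..63][0..63];
 * this SINGLE variant computes the trailing 32x32 corner tile -- which is why gidx and
 * gidy are taken from M/64 and N/64 rather than get_group_id() -- while the ROW and
 * COLUMN variants further below handle the remaining 32-wide edge strips.)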
******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SINGLE_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumRows = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_microTileNumCols = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_unroll = 16; #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin = 0; size_t sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize = 0; #endif const char * const sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { (float)0 }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local float lA[528]; __local float lB[528]; uint gidx = M / 64;//get_group_id(0); uint gidy = N / 64;//get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx * 64 + idx; int CurrentOffSetB = gidy * 64 + idx; A += gidx * 64 + idx + idy*lda; B += gidy * 64 + idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy * 33 + idx; __local float* plB = lB + idy * 33 + idx; barrier(CLK_LOCAL_MEM_FENCE); //plB[0] = CurrentOffSetB>=N?0.0:B[0]; //plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; //plB[32] = CurrentOffSetB+32>=N?0.0:B[32]; //plB[48] = CurrentOffSetB+48>=N?0.0:B[48]; plB[0] = B[0]; plB[16] = B[16]; //plA[0] = CurrentOffSetA>=M?0.0:A[0]; //plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; //plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; //plA[48] = CurrentOffSetA+48>=M?0.0:A[48]; plA[0] = A[0]; plA[16] = A[16]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda << 4; B += ldb << 4; } while (--block_k > 0); int offset_x = gidx * 64 + idx; int offset_y = gidy * 64 + idy; //if(offset_x>=M || offset_y>=N ) // return; C += offset_x + offset_y*ldc; int i = 0; do { C[0] = mad(alpha, rC[i][0], beta*C[0]); C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); C += 16; offset_x += 16; //if(offset_x>=M ) // return; } while (++i < 2); } ); #endif 
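/*******************************************************************************
 * Illustrative note, not part of the original library sources: the SINGLE, ROW
 * and COLUMN variants split the boundary work that is left over once the 64x64
 * macro-tile kernels have covered the bulk of C, while the BRANCH variant
 * instead bounds-checks every access for sizes that are not multiples of 32.
 * A hypothetical helper (the names are made up; this is not clBLAS's actual
 * selection logic) could decide which boundary kernels to launch like this:
 ******************************************************************************/
#if 0 /* sketch only, never compiled */
static void pickBoundaryKernels(unsigned int M, unsigned int N,
                                int *needRow, int *needColumn, int *needSingle)
{
    /* assumes M and N are multiples of 32, per the comments in these files */
    *needRow    = (M % 64) != 0;            /* trailing 32-row strip of C    */
    *needColumn = (N % 64) != 0;            /* trailing 32-column strip of C */
    *needSingle = *needRow && *needColumn;  /* 32x32 corner tile             */
}
#endif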
clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX032_NX032_KX16_src.cpp000066400000000000000000000060511264277366700325040ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_NT_B1_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX032_NX032_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idyT*33+idxT; __local float* plB = lB + idyT*33+idxT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; } ); #endif sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src.cpp000066400000000000000000000103071264277366700331600ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel * below kernels work with an assumption: after the main matrix being computed by kernels with 64x64 micro tile size, the boundary are of size 32. * Thus, M and N are of mod32 and not necessarily of mod64. 
******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX032_NX064_KX16_ROW_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumRows = 2; const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_microTileNumCols = 4; const unsigned int sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_unroll = 16; //if precompiled is enabled. All hand tuned kerenls should be precompiled. #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin = 0; size_t sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize = 0; #endif const char * const sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src = STRINGIFY( #define M2x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 33; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX032_NX064_KX16_ROW ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][4] = { (float)0 }; float rA[1][2]; float rB[1][4]; A += offsetA; B += offsetB; C += offsetC; __local float lA[528];//16*32+16 __local float lB[1040];//16*64+16 uint gidx = M / 64;//get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx * 64 + idx; A += gidx * 64 + idx + idy*lda; B += gidy * 64 + idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy * 33 + idx; __local float* plB = lB + idy * 65 + idx; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0 + 0 * ldb]; plB[16] = B[16 + 0 * ldb]; plB[32] = B[32 + 0 * ldb]; plB[48] = B[48 + 0 * ldb]; //plA[0] = CurrentOffSetA>=M?0.0:A[0]; //plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; //plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; //plA[48] = CurrentOffSetA+48>=M?0.0:A[48]; plA[0] = A[0]; plA[16] = A[16]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 M2x4 A += lda << 4; B += ldb << 4; } while (--block_k > 0); int offset_x = gidx * 64 + idx; int offset_y = gidy * 64 + idy; //if(offset_x>=M ) // return; C += offset_x + offset_y*ldc; int i = 0; do { C[0] = mad(alpha, rC[i][0], beta*C[0]); C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); C[32 * ldc] = mad(alpha, rC[i][2], beta*C[32 * ldc]); C[48 * ldc] = mad(alpha, rC[i][3], beta*C[48 * ldc]); C += 16; offset_x += 16; //if(offset_x>=M ) // return; } while (++i < 2); } ); #endif 
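/*******************************************************************************
 * Illustrative note, not part of the original library sources: the local tiles
 * in these kernels are indexed with one extra float per 16-deep column (offA
 * and offB advance by 33, 65, 97 or 129, and plA/plB use idy*33, idy*65, ...),
 * presumably so the 16x16 work-group avoids LDS bank conflicts while staging
 * the tile. The allocations are sized to match, e.g. 528 = 16*(32+1) and
 * 1040 = 16*(64+1), with some variants rounding up to 544 or 1056.
 ******************************************************************************/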
sgemm_Col_NT_B1_MX064_NX032_KX16_COL_src.cpp000066400000000000000000000100321264277366700331210ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel * below kernels work with an assumption: after the main matrix being computed by kernels with 64x64 micro tile size, the boundary are of size 32. * Thus, M and N are of mod32 and not necessarily of mod64. ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX032_KX16_COLUMN_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumRows = 4; const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_microTileNumCols = 2; const unsigned int sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_unroll = 16; //if precompiled is enabled. All hand tuned kerenls should be precompiled. #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin = 0; size_t sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize = 0; #endif const char * const sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src = STRINGIFY( #define M4x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 65; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][2] = { (float)0 }; float rA[1][4]; float rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local float lA[1040];//16*64+16 __local float lB[528];//16*32+16 uint gidx = get_group_id(0); uint gidy = N / 64;//get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetB = gidy * 64 + idx; A += gidx * 64 + idx + idy*lda; B += gidy * 64 + idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy * 65 + idx; __local float* plB = lB + idy * 33 + idx; barrier(CLK_LOCAL_MEM_FENCE); //plB[0] = CurrentOffSetB>=N?0.0:B[0]; //plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; //plB[32] = CurrentOffSetB+32>=N?0.0:B[32]; //plB[48] = CurrentOffSetB+48>=N?0.0:B[48]; plB[0] = B[0]; plB[16] = B[16]; plA[0] = A[0]; plA[16] = A[16]; plA[32] = A[32]; plA[48] = A[48]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 M4x2 A += lda << 4; B 
+= ldb << 4; } while (--block_k > 0); int offset_x = gidx * 64 + idx; int offset_y = gidy * 64 + idy; //if(offset_y>=N ) // return; C += offset_x + offset_y*ldc; int i = 0; do { C[0] = mad(alpha, rC[i][0], beta*C[0]); C[16 * ldc] = mad(alpha, rC[i][1], beta*C[16 * ldc]); C += 16; } while (++i < 4); } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX064_NX064_KX16_src.cpp000066400000000000000000000111221264277366700325110ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX064_NX064_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_NT_B1_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_NT_B1_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_NT_B1_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1040]; __local float lB[1040]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64+ idxT + idyT*lda; B += gidy*64+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idyT*65+idxT; __local float* plB = lB + idyT*65+idxT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plB[32] = B[32+0*ldb]; plB[48] = B[48+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 
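/* as in the NN 64x64 variant, these 16 M4x4 steps walk the staged local tiles one padded row (65 floats) at a time before A and B are advanced to the next 16-deep K-block below */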
M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX096_NX096_KX16_src.cpp000066400000000000000000000156361264277366700325410ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX096_NX096_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX096_NX096_KX16_src overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_NT_B1_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int sgemm_Col_NT_B1_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_NT_B1_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); 
\ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += gidx*96+ idx + idy*lda; B += gidy*96+ idx + idy*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idy*97+idx; __local float* plB = lB + idy*97+idx; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0+0*ldb]; plB[16] = B[16+0*ldb]; plB[32] = B[32+0*ldb]; plB[48] = B[48+0*ldb]; plB[64] = B[64+0*ldb]; plB[80] = B[80+0*ldb]; plA[0] = A[0+0*lda]; plA[16] = A[16+0*lda]; plA[32] = A[32+0*lda]; plA[48] = A[48+0*lda]; plA[64] = A[64+0*lda]; plA[80] = A[80+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda<<4; B += ldb<<4; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif 
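/*******************************************************************************
 * Illustrative note, not part of the original library sources: every kernel in
 * this directory takes the same fourteen arguments in the same order (A, B, C,
 * alpha, beta, M, N, K, lda, ldb, ldc, offsetA, offsetB, offsetC). The sketch
 * below shows how a caller could bind them; the function and variable names
 * are hypothetical and only the OpenCL API calls themselves are real.
 ******************************************************************************/
#if 0 /* sketch only, never compiled */
#include <CL/cl.h>

static void setSgemmKernelArgs(cl_kernel k, cl_mem A, cl_mem B, cl_mem C,
                               cl_float alpha, cl_float beta,
                               cl_uint M, cl_uint N, cl_uint K,
                               cl_uint lda, cl_uint ldb, cl_uint ldc,
                               cl_uint offA, cl_uint offB, cl_uint offC)
{
    cl_uint i = 0;
    clSetKernelArg(k, i++, sizeof(cl_mem),   &A);
    clSetKernelArg(k, i++, sizeof(cl_mem),   &B);
    clSetKernelArg(k, i++, sizeof(cl_mem),   &C);
    clSetKernelArg(k, i++, sizeof(cl_float), &alpha);
    clSetKernelArg(k, i++, sizeof(cl_float), &beta);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &M);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &N);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &K);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &lda);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &ldb);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &ldc);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &offA);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &offB);
    clSetKernelArg(k, i++, sizeof(cl_uint),  &offC);
}
#endif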
clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_NT_B1_MX128_NX128_KX16_src.cpp000066400000000000000000000254361264277366700325300ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H #define KERNEL_SGEMM_COL_NT_B1_MX128_NX128_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_NT_B1_MX128_NX128_KX16_src (if exists) overriden by user.") #include "UserGemmKernelSourceIncludes.h" #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumRows = 8; const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_microTileNumCols = 8; const unsigned int sgemm_Col_NT_B1_MX128_NX128_KX16_unroll = 16; //if precompiled is enabled. All hand tuned kerenls should be precompiled. #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_NT_B1_MX128_NX128_KX16_bin = 0; size_t sgemm_Col_NT_B1_MX128_NX128_KX16_binSize = 0; #endif const char * const sgemm_Col_NT_B1_MX128_NX128_KX16_src = STRINGIFY( #define M8x8 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rA[0][6] = lA[offA + 96]; \ rA[0][7] = lA[offA + 112]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ rB[0][6] = lB[offB + 96]; \ rB[0][7] = lB[offB + 112]; \ offA += 129; \ offB += 129; \ rC[0][0] = mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0] = mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0] = mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0] = mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0] = mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0] = mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[6][0] = mad(rA[0][6],rB[0][0],rC[6][0]); \ rC[7][0] = mad(rA[0][7],rB[0][0],rC[7][0]); \ rC[0][1] = mad(rA[0][0], rB[0][1], rC[0][1]); \ rC[1][1] = mad(rA[0][1], rB[0][1], rC[1][1]); \ rC[2][1] = mad(rA[0][2], rB[0][1], rC[2][1]); \ rC[3][1] = mad(rA[0][3], rB[0][1], rC[3][1]); \ rC[4][1] = mad(rA[0][4], rB[0][1], rC[4][1]); \ rC[5][1] = mad(rA[0][5], rB[0][1], rC[5][1]); \ rC[6][1] = mad(rA[0][6], rB[0][1], rC[6][1]); \ rC[7][1] = mad(rA[0][7], rB[0][1], rC[7][1]); \ rC[0][2] = mad(rA[0][0], rB[0][2], rC[0][2]); \ rC[1][2] = mad(rA[0][1], rB[0][2], rC[1][2]); \ rC[2][2] = mad(rA[0][2], rB[0][2], rC[2][2]); \ rC[3][2] = mad(rA[0][3], rB[0][2], rC[3][2]); \ rC[4][2] = mad(rA[0][4], rB[0][2], rC[4][2]); \ rC[5][2] = mad(rA[0][5], rB[0][2], rC[5][2]); \ rC[6][2] = mad(rA[0][6], rB[0][2], rC[6][2]); \ rC[7][2] = mad(rA[0][7], rB[0][2], rC[7][2]); \ rC[0][3] = mad(rA[0][0], rB[0][3], rC[0][3]); \ rC[1][3] = mad(rA[0][1], rB[0][3], rC[1][3]); \ rC[2][3] = mad(rA[0][2], rB[0][3], rC[2][3]); \ rC[3][3] = mad(rA[0][3], rB[0][3], rC[3][3]); \ rC[4][3] = mad(rA[0][4], rB[0][3], rC[4][3]); \ rC[5][3] = mad(rA[0][5], rB[0][3], rC[5][3]); \ rC[6][3] = mad(rA[0][6], rB[0][3], rC[6][3]); \ rC[7][3] = mad(rA[0][7], rB[0][3], rC[7][3]); \ rC[0][4] = mad(rA[0][0], rB[0][4], rC[0][4]); \ rC[1][4] = mad(rA[0][1], rB[0][4], rC[1][4]); \ rC[2][4] = mad(rA[0][2], rB[0][4], rC[2][4]); \ rC[3][4] = mad(rA[0][3], rB[0][4], 
rC[3][4]); \ rC[4][4] = mad(rA[0][4], rB[0][4], rC[4][4]); \ rC[5][4] = mad(rA[0][5], rB[0][4], rC[5][4]); \ rC[6][4] = mad(rA[0][6], rB[0][4], rC[6][4]); \ rC[7][4] = mad(rA[0][7], rB[0][4], rC[7][4]); \ rC[0][5] = mad(rA[0][0], rB[0][5], rC[0][5]); \ rC[1][5] = mad(rA[0][1], rB[0][5], rC[1][5]); \ rC[2][5] = mad(rA[0][2], rB[0][5], rC[2][5]); \ rC[3][5] = mad(rA[0][3], rB[0][5], rC[3][5]); \ rC[4][5] = mad(rA[0][4], rB[0][5], rC[4][5]); \ rC[5][5] = mad(rA[0][5], rB[0][5], rC[5][5]); \ rC[6][5] = mad(rA[0][6], rB[0][5], rC[6][5]); \ rC[7][5] = mad(rA[0][7], rB[0][5], rC[7][5]); \ rC[0][6] = mad(rA[0][0], rB[0][6], rC[0][6]); \ rC[1][6] = mad(rA[0][1], rB[0][6], rC[1][6]); \ rC[2][6] = mad(rA[0][2], rB[0][6], rC[2][6]); \ rC[3][6] = mad(rA[0][3], rB[0][6], rC[3][6]); \ rC[4][6] = mad(rA[0][4], rB[0][6], rC[4][6]); \ rC[5][6] = mad(rA[0][5], rB[0][6], rC[5][6]); \ rC[6][6] = mad(rA[0][6], rB[0][6], rC[6][6]); \ rC[7][6] = mad(rA[0][7], rB[0][6], rC[7][6]); \ rC[0][7] = mad(rA[0][0], rB[0][7], rC[0][7]); \ rC[1][7] = mad(rA[0][1], rB[0][7], rC[1][7]); \ rC[2][7] = mad(rA[0][2], rB[0][7], rC[2][7]); \ rC[3][7] = mad(rA[0][3], rB[0][7], rC[3][7]); \ rC[4][7] = mad(rA[0][4], rB[0][7], rC[4][7]); \ rC[5][7] = mad(rA[0][5], rB[0][7], rC[5][7]); \ rC[6][7] = mad(rA[0][6], rB[0][7], rC[6][7]); \ rC[7][7] = mad(rA[0][7], rB[0][7], rC[7][7]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_NT_B1_MX128_NX128_KX16( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[8][8] = { (float)0 }; float rA[1][8]; float rB[1][8]; A += offsetA; B += offsetB; C += offsetC; __local float lA[2064]; __local float lB[2064]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16 * idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx * 128 + idxT + idyT*lda; B += gidy * 128 + idxT + idyT*ldb; uint block_k = K >> 4; do { // for(unsigned int block_k=0 ; block_k< K ; block_k+=16) //{ __local float* plA = lA + idyT * 129 + idxT; __local float* plB = lB + idyT * 129 + idxT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0 + 0 * ldb]; plB[16] = B[16 + 0 * ldb]; plB[32] = B[32 + 0 * ldb]; plB[48] = B[48 + 0 * ldb]; plB[64] = B[64 + 0 * ldb]; plB[80] = B[80 + 0 * ldb]; plB[96] = B[96 + 0 * ldb]; plB[112] = B[112 + 0 * ldb]; plA[0] = A[0 + 0 * lda]; plA[16] = A[16 + 0 * lda]; plA[32] = A[32 + 0 * lda]; plA[48] = A[48 + 0 * lda]; plA[64] = A[64 + 0 * lda]; plA[80] = A[80 + 0 * lda]; plA[96] = A[96 + 0 * lda]; plA[112] = A[112 + 0 * lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; // #pragma unroll 1 // for(unsigned int k = 0 ; k < 16; k+=1){ // } M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 M8x8 A += lda << 4; B += ldb << 4; //} } while (--block_k > 0); C += gidx * 128 + idx; C += gidy * 128 * ldc; C += idy*ldc; C[0 * ldc] = alpha*rC[0][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[0][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[0][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[0][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[0][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[0][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[0][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[0][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[1][0] + beta*C[0 * 
ldc]; C[16 * ldc] = alpha*rC[1][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[1][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[1][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[1][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[1][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[1][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[1][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[2][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[2][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[2][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[2][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[2][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[2][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[2][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[2][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[3][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[3][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[3][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[3][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[3][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[3][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[3][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[3][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[4][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[4][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[4][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[4][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[4][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[4][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[4][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[4][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[5][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[5][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[5][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[5][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[5][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[5][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[5][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[5][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[6][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[6][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[6][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[6][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[6][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[6][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[6][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[6][7] + beta*C[112 * ldc]; C += 16; C[0 * ldc] = alpha*rC[7][0] + beta*C[0 * ldc]; C[16 * ldc] = alpha*rC[7][1] + beta*C[16 * ldc]; C[32 * ldc] = alpha*rC[7][2] + beta*C[32 * ldc]; C[48 * ldc] = alpha*rC[7][3] + beta*C[48 * ldc]; C[64 * ldc] = alpha*rC[7][4] + beta*C[64 * ldc]; C[80 * ldc] = alpha*rC[7][5] + beta*C[80 * ldc]; C[96 * ldc] = alpha*rC[7][6] + beta*C[96 * ldc]; C[112 * ldc] = alpha*rC[7][7] + beta*C[112 * ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX032_NX032_KX16_src.cpp000066400000000000000000000056451264277366700325130ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B0_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B0_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B0_MX032_NX032_KX16_workGroupNumCols = 
16; const unsigned int sgemm_Col_TN_B0_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_TN_B0_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_TN_B0_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_TN_B0_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B0_MX032_NX032_KX16_src ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += (gidx*32+idyT)*lda+ idxT ; B += (gidy*32+idyT)*ldb + idxT; uint block_k = K >> 4; do { __local float* plA = lA + idxT*33+idyT; __local float* plB = lB + idxT*33+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += 16; B += 16; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX064_NX064_KX16_src.cpp000066400000000000000000000103531264277366700325150ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B0_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B0_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B0_MX064_NX064_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B0_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_TN_B0_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_TN_B0_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_TN_B0_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ 
rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B0_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64*lda+ idxT + idyT*lda; B += gidy*64*ldb+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idxT*65+idyT; __local float* plB = lB + idxT*65+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; plA[32] = A[32*lda]; plA[48] = A[48*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += 16; B += 16; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1] ; C[32*ldc] = alpha*rC[0][2] ; C[48*ldc] = alpha*rC[0][3] ; C+=16; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1] ; C[32*ldc] = alpha*rC[1][2] ; C[48*ldc] = alpha*rC[1][3] ; C+=16; C[0*ldc] = alpha*rC[2][0] ; C[16*ldc] = alpha*rC[2][1] ; C[32*ldc] = alpha*rC[2][2] ; C[48*ldc] = alpha*rC[2][3] ; C+=16; C[0*ldc] = alpha*rC[3][0] ; C[16*ldc] = alpha*rC[3][1] ; C[32*ldc] = alpha*rC[3][2] ; C[48*ldc] = alpha*rC[3][3] ; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B0_MX096_NX096_KX16_src.cpp000066400000000000000000000152541264277366700325340ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B0_MX096_NX096_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B0_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B0_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B0_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B0_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_TN_B0_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int sgemm_Col_TN_B0_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_TN_B0_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = 
lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); \ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); \ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B0_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += (gidx*96+idy)*lda + idx; B += (gidy*96+idy)*ldb + idx; uint block_k = K >> 4; do { __local float* plA = lA + idx*97+idy; __local float* plB = lB + idx*97+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plB[64] = B[64*ldb]; plB[80] = B[80*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; plA[32] = A[32*lda]; plA[48] = A[48*lda]; plA[64] = A[64*lda]; plA[80] = A[80*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += 16; B += 16; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = 
alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src.cpp000066400000000000000000000070301264277366700334400ustar00rootroot00000000000000clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_BRANCH_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumRows = 2; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_microTileNumCols = 2; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_unroll = 16; #ifndef AUTOGEMM_USE_PRE_COMPILED_KERNELS unsigned char *sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin = 0; size_t sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize = 0; #endif const char * const sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[528];//16*32+16 __local float lB[528]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); int CurrentOffSetA = gidx*32+ idy; int CurrentOffSetB = 
gidy*32+ idy; A += (gidx*32+idy)*lda + idx; B += (gidy*32+idy)*ldb + idx; uint block_k = K >> 4; do { __local float* plA = lA + idx*33+idy; __local float* plB = lB + idx*33+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += 16; B += 16; } while (--block_k > 0); int offset_x = gidx*32+idx; int offset_y = gidy*32+ idy; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; int i = 0; do { C[0 ] = mad(alpha, rC[i][0], beta*C[0]); if(offset_y+16<N) C[16*ldc] = mad(alpha, rC[i][1], beta*C[16*ldc]); C+=16; offset_x+=16; if(offset_x>=M ) return; } while (++i < 2); } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX032_NX032_KX16_src.cpp000066400000000000000000000060451264277366700325070ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX032_NX032_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B1_MX032_NX032_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_microTileNumRows = 2; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_microTileNumCols = 2; const unsigned int sgemm_Col_TN_B1_MX032_NX032_KX16_unroll = 16; const char * const sgemm_Col_TN_B1_MX032_NX032_KX16_src = STRINGIFY( #define M2x2 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ offA += 33; \ offB += 33; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B1_MX032_NX032_KX16_src ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[2][2] = { {(float)0} }; float rA[1][2]; float rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[544]; __local float lB[544]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += (gidx*32+idyT)*lda+ idxT ; B += (gidy*32+idyT)*ldb + idxT; uint block_k = K >> 4; do { __local float* plA = lA + idxT*33+idyT; __local float* plB = lB + idxT*33+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 M2x2 A += 16; B += 16; } while (--block_k > 0); C+= gidx*32+idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + 
beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX064_NX064_KX16_src.cpp000066400000000000000000000107261264277366700325220ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX064_NX064_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B1_MX064_NX064_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B1_MX064_NX064_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B1_MX064_NX064_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B1_MX064_NX064_KX16_microTileNumRows = 4; const unsigned int sgemm_Col_TN_B1_MX064_NX064_KX16_microTileNumCols = 4; const unsigned int sgemm_Col_TN_B1_MX064_NX064_KX16_unroll = 16; const char * const sgemm_Col_TN_B1_MX064_NX064_KX16_src = STRINGIFY( #define M4x4 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ offA += 65; \ offB += 65; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B1_MX064_NX064_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[4][4] = { {(float)0} }; float rA[1][4]; float rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1056]; __local float lB[1056]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); uint idt = 16*idy + idx; uint idxT = idt % 16; uint idyT = idt / 16; A += gidx*64*lda+ idxT + idyT*lda; B += gidy*64*ldb+ idxT + idyT*ldb; uint block_k = K >> 4; do { __local float* plA = lA + idxT*65+idyT; __local float* plB = lB + idxT*65+idyT; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; plA[32] = A[32*lda]; plA[48] = A[48*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 M4x4 A += 16; B += 16; } while (--block_k > 0); C+= gidx*64+idx; C+= gidy*64*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; 
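/* Write-back note (explanatory comment, stripped by the preprocessor before the kernel
   string is generated): each work-item of the 16x16 work-group owns a 4x4 micro-tile of
   the 64x64 macro-tile, with its elements spaced 16 apart in both dimensions; every store
   in this epilogue has the form C = alpha*rC[row][col] + beta*C, and C is advanced by 16
   between successive micro-tile rows. */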
C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; } ); #endif clblas-2.10/src/library/blas/AutoGemm/UserGemmKernelSources/sgemm_Col_TN_B1_MX096_NX096_KX16_src.cpp000066400000000000000000000152541264277366700325350ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H #define KERNEL_SGEMM_COL_TN_B1_MX096_NX096_KX16_SRC_H #pragma message("AutoGemm's sgemm_Col_TN_B1_MX096_NX096_KX16_src overriden by user.") #ifndef STRINGIFY #define STRINGIFY(S) STRINGIFY2(S) #define STRINGIFY2(S) #S #endif const unsigned int sgemm_Col_TN_B1_MX096_NX096_KX16_workGroupNumRows = 16; const unsigned int sgemm_Col_TN_B1_MX096_NX096_KX16_workGroupNumCols = 16; const unsigned int sgemm_Col_TN_B1_MX096_NX096_KX16_microTileNumRows = 6; const unsigned int sgemm_Col_TN_B1_MX096_NX096_KX16_microTileNumCols = 6; const unsigned int sgemm_Col_TN_B1_MX096_NX096_KX16_unroll = 16; const char * const sgemm_Col_TN_B1_MX096_NX096_KX16_src = STRINGIFY( #define M6x6 \ rA[0][0] = lA[offA + 0]; \ rA[0][1] = lA[offA + 16]; \ rA[0][2] = lA[offA + 32]; \ rA[0][3] = lA[offA + 48]; \ rA[0][4] = lA[offA + 64]; \ rA[0][5] = lA[offA + 80]; \ rB[0][0] = lB[offB + 0]; \ rB[0][1] = lB[offB + 16]; \ rB[0][2] = lB[offB + 32]; \ rB[0][3] = lB[offB + 48]; \ rB[0][4] = lB[offB + 64]; \ rB[0][5] = lB[offB + 80]; \ offA += 97; \ offB += 97; \ rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); \ rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); \ rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); \ rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); \ rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); \ rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); \ rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); \ rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); \ rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); \ rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); \ rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); \ rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); \ rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); \ rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); \ rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); \ rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); \ rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); \ rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); \ rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); \ rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); \ rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); \ rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); \ rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); \ rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); \ rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); \ rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); \ rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); \ rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); \ rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); \ rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); \ rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); \ rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); \ rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); \ rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); 
\ rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); \ rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); \ mem_fence(CLK_LOCAL_MEM_FENCE);\n __attribute__((reqd_work_group_size(16,16,1))) __kernel void sgemm_Col_TN_B1_MX096_NX096_KX16 ( __global float const * restrict A, __global float const * restrict B, __global float * C, float const alpha, float const beta, uint const M, uint const N, uint const K, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { float rC[6][6] = { {(float)0} }; float rA[1][6]; float rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local float lA[1552]; __local float lB[1552]; uint gidx = get_group_id(0); uint gidy = get_group_id(1); uint idx = get_local_id(0); uint idy = get_local_id(1); A += (gidx*96+idy)*lda + idx; B += (gidy*96+idy)*ldb + idx; uint block_k = K >> 4; do { __local float* plA = lA + idx*97+idy; __local float* plB = lB + idx*97+idy; barrier(CLK_LOCAL_MEM_FENCE); plB[0] = B[0]; plB[16] = B[16*ldb]; plB[32] = B[32*ldb]; plB[48] = B[48*ldb]; plB[64] = B[64*ldb]; plB[80] = B[80*ldb]; plA[0] = A[0]; plA[16] = A[16*lda]; plA[32] = A[32*lda]; plA[48] = A[48*lda]; plA[64] = A[64*lda]; plA[80] = A[80*lda]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = idx; uint offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += 16; B += 16; } while (--block_k > 0); C+= gidx*96+idx; C+= gidy*96*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[0][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[0][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[0][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[0][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[1][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[1][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[1][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[1][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[2][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[2][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[2][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[2][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[3][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[3][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[3][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[3][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[4][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[4][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[4][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[4][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[4][5] + beta*C[80*ldc]; C+=16; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[5][1] + beta*C[16*ldc]; C[32*ldc] = alpha*rC[5][2] + beta*C[32*ldc]; C[48*ldc] = alpha*rC[5][3] + beta*C[48*ldc]; C[64*ldc] = alpha*rC[5][4] + beta*C[64*ldc]; C[80*ldc] = alpha*rC[5][5] + beta*C[80*ldc]; } ); #endif clblas-2.10/src/library/blas/fill.cc000066400000000000000000000211221264277366700173270ustar00rootroot00000000000000/************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #define SWAP(TYPE,a,b) do { TYPE swap_tmp_ = a ; a = b ; b = swap_tmp_ ; } while(0) // Return true if the area starting from pint (x,y) and of size (w,h) is // within the array of size d1 x d2 static int inside2d( size_t d1, size_t d2, int x, int y, size_t w, size_t h ) { // Very very large dimensions are likely a bug size_t MAXDIM = ((size_t)INT_MAX) ; if ( d1 >= MAXDIM ) return 0 ; if ( d2 >= MAXDIM ) return 0 ; if ( w >= MAXDIM ) return 0 ; if ( h >= MAXDIM ) return 0 ; if ( x < 0 || x >= (int)d1 ) return 0 ; size_t max_w = (size_t)(d1-x) ; if ( w > max_w ) return 0 ; if ( y < 0 || y >= (int)d2 ) return 0 ; size_t max_h = (size_t)(d2-y) ; if ( h > max_h ) return 0 ; return 1 ; } extern "C" clblasStatus clblasFillVectorAsync( size_t nb_elem, size_t element_size, cl_mem A, size_t offA, const void * host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { return (clblasStatus) clEnqueueFillBuffer(command_queue, A, host, element_size, offA*element_size, nb_elem*element_size, numEventsInWaitList, eventWaitList, event); } extern "C" clblasStatus clblasFillVector( size_t nb_elem, size_t element_size, cl_mem A, size_t offA, const void * host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { cl_event event ; cl_int err = clblasFillVectorAsync( nb_elem, element_size, A, offA, host, command_queue, numEventsInWaitList, eventWaitList, &event) ; if (err == clblasSuccess) { err = clWaitForEvents(1,&event) ; } return (clblasStatus) err ; } extern "C" clblasStatus clblasFillSubMatrixAsync( clblasOrder order, size_t element_size, cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, int xA, int yA, size_t nx, size_t ny, const void *host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { // Transform Row-major into equivalent ColumnMajor so X becomes the contiguous dimension. 
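/*
 * Why the swap below works: a row-major nrA x ncA matrix with leading dimension ldA
 * occupies memory exactly like a column-major ncA x nrA matrix with the same ldA.
 * Element (row r, col c) of the row-major view lives at offset c + r*ldA, which is
 * precisely where the column-major view stores element (row c, col r). Exchanging
 * nrA/ncA, xA/yA and nx/ny therefore reduces the row-major case to the column-major
 * one handled below, in which consecutive x indices are contiguous in memory (hence
 * "X becomes the contiguous dimension") and the fill offset is offA + xA + yA*ldA.
 */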
if( order == clblasRowMajor ) { SWAP(size_t, nrA, ncA); SWAP(int, xA, yA); SWAP(size_t, nx, ny); } // Check that the specified area is within the array if ( !inside2d( nrA,ncA, xA,yA , nx, ny ) ) { return clblasInvalidValue ; } // If the area to fill is contiguous then use clblasFillVector if ( nx==ldA || ny==1 ) { return clblasFillVectorAsync( nx*ny, element_size, A, offA + xA + yA*ldA, host, command_queue, numEventsInWaitList, eventWaitList, event) ; } else if (1) { clblasFill2DFunctor::Args args(A, offA + xA + yA*ldA, nx,ny, ldA, element_size, host, command_queue, numEventsInWaitList, eventWaitList, event) ; clblasFunctorSelector * fselector = clblasFunctorSelector::find(command_queue); clblasFill2DFunctor * functor = fselector->select_fill2d_specific(args); if (!functor) return clblasInvalidValue ; cl_int err = functor->execute(args); functor->release(); return (clblasStatus) err ; } else { // Temporary: perform one fill per row cl_int err ; for( size_t i=0; i #include "math.h" #include "hawaii_sgemmSplitKernel.h" #include "gcn_sgemmSmallMatrices.h" FunctorSelectorBonaire FunctorSelectorBonaire::instance ; FunctorSelectorBonaire::FunctorSelectorBonaire() : clblasFunctorSelector(BONAIRE) { } // // The selector function for DGEMM on hawaii // // // The selector function for SGEMM on bonaire clblasSgemmFunctor * FunctorSelectorBonaire::select_sgemm_specific(clblasSgemmFunctor::Args & args) { #ifdef CLBLAS_BONAIRE_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_sgemm_specific(args); #else clblasSgemmFunctor * functor; bool Not_TT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans ) || ( args.transA==clblasTrans && args.transB==clblasNoTrans )); bool SmallMatrices = args.M*args.N<256*256 || ((args.M%64!=0 && args.N%64!=0 && args.M<1900 &&args.N<1900 ) && (args.M%96!=0 && args.N%96!=0 && args.M<1900 &&args.N<1900 )); SmallMatrices= (SmallMatrices && (args.M%32==0&&args.N%32==0)) ; SmallMatrices=SmallMatrices&&Not_TT&&args.K%16==0; // SmallMatrices= false; bool useSpliKernel=((args.M%96==0 && args.N%96==0) ||!(args.M%64==0 && args.N%64==0&& args.M<4000 &&args.N<4000)) &&args.K%16==0; useSpliKernel=useSpliKernel&&Not_TT; if (args.alpha!=0 ) { if (SmallMatrices) { functor = clBlasGCNSgemmSmallMatricesFunctor::provide(args, "Bonaire"); if (functor) return functor; } if ( useSpliKernel) { functor = clBlashawaiiSgemmSplitKernelFunctor::provide(args, "Bonaire"); if (functor) return functor; } else { functor = clblasSgemmFunctorGCN::provide(args, "Bonaire"); if (functor) return functor; } } // else use the fallback implementation return this->clblasFunctorSelector::select_sgemm_specific(args); #endif } clblas-2.10/src/library/blas/functor/functor.cc000066400000000000000000000050071264277366700215450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include #include #include clblasFunctor::clblasFunctor() : refcount(1) // implicit retain { } clblasFunctor::~clblasFunctor() { } void clblasFunctor::retain() { refcount.increment() ; } void clblasFunctor::release() { int n = refcount.decrement() ; if (n==0) { delete this; } } cl_int clblasFunctor::getDeviceAndContext(cl_command_queue queue, cl_device_id & device, cl_context & context) { cl_int err; err = getQueueContext(queue, &context); if (err != CL_SUCCESS) { return err; } err = getQueueDevice(queue, &device); if (err != CL_SUCCESS) { return err; } return CL_SUCCESS; } cl_uint clblasFunctor::getAddressBits(cl_device_id & device) { cl_uint bitness; cl_uint error = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &bitness, NULL); if(error==CL_SUCCESS) return bitness; else return 32; } void clblasFunctor::getCLVersion(cl_device_id & device, int&major, int& minor) { size_t size = 0; cl_int success = 0; major = 0; minor = 0; success = clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, 0, NULL, &size); if (success == CL_SUCCESS) { char* CLVersion = NULL; if (size) { CLVersion = new char[size]; if (CLVersion) success = clGetDeviceInfo(device, CL_DEVICE_OPENCL_C_VERSION, size, CLVersion, NULL); else return; if (success != CL_SUCCESS) return; char Major = CLVersion[9]; char Minor = CLVersion[11]; major = atoi(&Major); minor = atoi(&Minor); } } } clblas-2.10/src/library/blas/functor/functor_fill.cc000066400000000000000000000106421264277366700225540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include // The internal cache of clblasFill2DFunctorFallback typedef clblasFunctorCache Cache; static Cache cache ; // Generic fill kernel: require macro TYPE to be defined to an element type static const char FILL2D_KERNEL_SRC[] = "\n\ __kernel void fill2d( __global TYPE * A, int offA, int ldA, TYPE value) \n\ {\n\ A[ offA + get_global_id(0) + get_global_id(1) * ldA ] = value ; \n\ }\n\ " ; clblasFill2DFunctorDefault::clblasFill2DFunctorDefault(cl_context ctxt, cl_device_id dev, int elemsize, cl_int & err) : m_elemsize(elemsize), m_program(0) { BinaryLookup bl(ctxt, dev, "clblasFill2DFunctorDefault"); bl.variantInt(elemsize); if ( bl.found() ) // may create empty file or may wait until file is ready { m_program = bl.getProgram(); } else { const char * options; switch( elemsize ) { case 1: options = "-DTYPE=char"; break ; case 2: options = "-DTYPE=short"; break ; case 4: options = "-DTYPE=int"; break ; // or 'float' case 8: options = "-DTYPE=long"; break ; // or 'double' or 'complex float' case 16: options = "-DTYPE=float4"; break ; // or 'complex float' default: options = NULL ; // shall never happen } m_program = BinaryLookup::buildProgramFromSource(FILL2D_KERNEL_SRC, ctxt, dev, err, options); if (m_program) { bl.setProgram(m_program); bl.populateCache(); } } } clblasFill2DFunctorDefault::~clblasFill2DFunctorDefault() { if (this->m_program) { clReleaseProgram( this->m_program ) ; } } clblasStatus clblasFill2DFunctorDefault::execute(Args & args) { cl_int err; cl_kernel kernel = clCreateKernel( this->m_program, "fill2d", &err); if (err != CL_SUCCESS) return clblasStatus(err) ; clblasFunctor::setKernelArg (kernel, 0, args.A); clblasFunctor::setKernelArg (kernel, 1, args.offA); clblasFunctor::setKernelArg (kernel, 2, args.ldA); clblasFunctor::setKernelArgPtr (kernel, 3, args.elemsize, args.value); size_t globalThreads[2] = { args.m , args.n }; err = clEnqueueNDRangeKernel(args.queue, kernel, 2, NULL, globalThreads, NULL , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; return clblasStatus(err) ; } clblasFill2DFunctorDefault * clblasFill2DFunctorDefault::provide(Args & args) { // The current implementation only support the common scalar data // sizes from 'char' (1) to 'double complex' 16 switch(args.elemsize) { case 1: case 2: case 4: case 8: case 16: break ; default: return NULL ; } cl_device_id dev; cl_context ctxt; cl_int err = clblasFunctor::getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } Cache::Lookup lookup(cache, ctxt, dev, args.elemsize ) ; if ( lookup.ok() ) { clblasFill2DFunctorDefault * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clblasFill2DFunctorDefault * functor = new clblasFill2DFunctorDefault(ctxt, dev, args.elemsize, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/functor_selector.cc000066400000000000000000000210741264277366700234470ustar00rootroot00000000000000 /* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include // This is the selector used by default for 'unknown' targets clblasFunctorSelector clblasFunctorSelector::default_instance ; typedef std::map<DeviceChip, clblasFunctorSelector *> clblasFunctorSelectorMap ; // // Provide a global map in which all clblasFunctorSelector will register during // their constructor (with the exception of the default one). // // Remark: For that, we use the "construct on first use" idiom, to avoid the // infamous "static initialization order fiasco". // See for example http://www.parashift.com/c++-faq/static-init-order.html // static clblasFunctorSelectorMap & getMap() { static clblasFunctorSelectorMap * the_map = new clblasFunctorSelectorMap ; return * the_map ; } // Constructor for the non-default selector clblasFunctorSelector::clblasFunctorSelector(DeviceChip chip) { clblasFunctorSelectorMap::iterator it = getMap().find(chip); if (it != getMap().end()) { assert(false); } getMap()[chip] = this; } // Constructor for the default selector clblasFunctorSelector::clblasFunctorSelector() { } clblasFunctorSelector * clblasFunctorSelector::find(cl_command_queue queue) { cl_device_id device; cl_int status = getQueueDevice(queue, &device); assert( status == CL_SUCCESS ); return clblasFunctorSelector::find(device); } clblasFunctorSelector * clblasFunctorSelector::find(cl_device_id device) { TargetDevice td; td.id = device; cl_int status = identifyDevice(&td); assert( status == CL_SUCCESS ); return clblasFunctorSelector::find(td.ident.chip); } clblasFunctorSelector * clblasFunctorSelector::find(DeviceChip chip) { clblasFunctorSelectorMap & the_map = getMap(); clblasFunctorSelectorMap::iterator it = the_map.find(chip); if (it != the_map.end()) { return it->second; } else { return &default_instance ; } } int clblasFunctorSelector::FindFirePro(cl_device_id device) { return 1; /*char cardName [1024]; cl_int error = clGetDeviceInfo(device, CL_DEVICE_BOARD_NAME_AMD, sizeof(cardName), cardName, NULL); if (error!=CL_SUCCESS) return 0; else if (strstr (cardName, "FirePro")) return 1; else return 0; */ } // ================================================================================= // // XGEMM // // ================================================================================= clblasSgemmFunctor * clblasFunctorSelector::select_sgemm_generic() { return clblasSgemmFunctorFallback::provide(); } clblasDgemmFunctor * clblasFunctorSelector::select_dgemm_generic() { return clblasDgemmFunctorFallback::provide(); } clblasCgemmFunctor * clblasFunctorSelector::select_cgemm_generic() { return clblasCgemmFunctorFallback::provide(); } clblasZgemmFunctor * clblasFunctorSelector::select_zgemm_generic() { return clblasZgemmFunctorFallback::provide(); } clblasSgemmFunctor * clblasFunctorSelector::select_sgemm_specific(clblasSgemmFunctor::Args &) { return this->select_sgemm_generic() ; } clblasDgemmFunctor * clblasFunctorSelector::select_dgemm_specific(clblasDgemmFunctor::Args &) { return this->select_dgemm_generic() ; } clblasCgemmFunctor * clblasFunctorSelector::select_cgemm_specific(clblasCgemmFunctor::Args &) { 
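// Note on the dispatch scheme: the select_*_specific() methods are the hooks that
// per-chip selectors override with hand-tuned functors after registering themselves
// in the DeviceChip-keyed map above via the clblasFunctorSelector(DeviceChip)
// constructor (the Bonaire selector seen earlier overrides select_sgemm_specific()
// this way). The default implementations here simply forward to select_*_generic(),
// which supplies the fallback functor.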
return this->select_cgemm_generic() ; } clblasZgemmFunctor * clblasFunctorSelector::select_zgemm_specific(clblasZgemmFunctor::Args &) { return this->select_zgemm_generic() ; } // ================================================================================= // // XTRSM // // ================================================================================= clblasStrsmFunctor * clblasFunctorSelector::select_strsm_generic() { return clblasStrsmFunctorFallback::provide(); } clblasDtrsmFunctor * clblasFunctorSelector::select_dtrsm_generic() { return clblasDtrsmFunctorFallback::provide(); } clblasCtrsmFunctor * clblasFunctorSelector::select_ctrsm_generic() { return clblasCtrsmFunctorFallback::provide(); } clblasZtrsmFunctor * clblasFunctorSelector::select_ztrsm_generic() { return clblasZtrsmFunctorFallback::provide(); } clblasStrsmFunctor * clblasFunctorSelector::select_strsm_specific(clblasStrsmFunctor::Args &) { return this->select_strsm_generic() ; } clblasDtrsmFunctor * clblasFunctorSelector::select_dtrsm_specific(clblasDtrsmFunctor::Args &) { return this->select_dtrsm_generic() ; } clblasCtrsmFunctor * clblasFunctorSelector::select_ctrsm_specific(clblasCtrsmFunctor::Args &) { return this->select_ctrsm_generic() ; } clblasZtrsmFunctor * clblasFunctorSelector::select_ztrsm_specific(clblasZtrsmFunctor::Args &) { return this->select_ztrsm_generic() ; } // ================================================================================= // // XSCAL // // ================================================================================= clblasSscalFunctor * clblasFunctorSelector::select_sscal_generic(clblasSscalFunctor::Args & args) { clblasSscalFunctor * functor; functor = clblasSscalFunctorGeneric::provide(args); if(functor) return functor; return clblasSscalFunctorFallback::provide(); } clblasDscalFunctor * clblasFunctorSelector::select_dscal_generic(clblasDscalFunctor::Args & args) { clblasDscalFunctor * functor; functor = clblasDscalFunctorGeneric::provide(args); if(functor) return functor; return clblasDscalFunctorFallback::provide(); } clblasCscalFunctor * clblasFunctorSelector::select_cscal_generic(clblasCscalFunctor::Args & args) { clblasCscalFunctor * functor; functor = clblasCscalFunctorGeneric::provide(args); if(functor) return functor; return clblasCscalFunctorFallback::provide(); } clblasZscalFunctor * clblasFunctorSelector::select_zscal_generic(clblasZscalFunctor::Args & args) { clblasZscalFunctor * functor; functor = clblasZscalFunctorGeneric::provide(args); if(functor) return functor; return clblasZscalFunctorFallback::provide(); } clblasCsscalFunctor * clblasFunctorSelector::select_csscal_generic(clblasCsscalFunctor::Args & args) { clblasCsscalFunctor * functor; functor = clblasCsscalFunctorGeneric::provide(args); if(functor) return functor; return clblasCsscalFunctorFallback::provide(); } clblasZdscalFunctor * clblasFunctorSelector::select_zdscal_generic(clblasZdscalFunctor::Args & args) { clblasZdscalFunctor * functor; functor = clblasZdscalFunctorGeneric::provide(args); if(functor) return functor; return clblasZdscalFunctorFallback::provide(); } clblasSscalFunctor * clblasFunctorSelector::select_sscal_specific(clblasSscalFunctor::Args & args) { return this->select_sscal_generic(args) ; } clblasDscalFunctor * clblasFunctorSelector::select_dscal_specific(clblasDscalFunctor::Args & args) { return this->select_dscal_generic(args) ; } clblasCscalFunctor * clblasFunctorSelector::select_cscal_specific(clblasCscalFunctor::Args & args) { return this->select_cscal_generic(args) ; } 
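// Note: unlike the GEMM/TRSM selectors above, which return the fallback functor
// directly, the select_*scal_generic() methods first try the corresponding
// clblas*scalFunctorGeneric::provide() and only fall back to the Fallback functor
// when no generic functor is available for the given arguments.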
clblasZscalFunctor * clblasFunctorSelector::select_zscal_specific(clblasZscalFunctor::Args & args) { return this->select_zscal_generic(args) ; } clblasCsscalFunctor * clblasFunctorSelector::select_csscal_specific(clblasCsscalFunctor::Args & args) { return this->select_csscal_generic(args) ; } clblasZdscalFunctor * clblasFunctorSelector::select_zdscal_specific(clblasZdscalFunctor::Args & args) { return this->select_zdscal_generic(args) ; } // ================================================================================= // // FILL2D // // ================================================================================= clblasFill2DFunctor * clblasFunctorSelector::select_fill2d_specific(clblasFill2DFunctor::Args & args) { return clblasFill2DFunctorDefault::provide(args); } clblas-2.10/src/library/blas/functor/functor_xgemm.cc000066400000000000000000000206701264277366700227450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include // // Common part of all XGEMM implementations using the old Solver infrastructure // static clblasStatus doGemm( CLBlasKargs *kargs, clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) { return retCode; } if (K != 0) { if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET ))) { return retCode; } } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET ))) { return retCode; } #ifdef DEBUG_2 printf("DoGemm being called...\n"); #endif kargs->order = order; kargs->transA = transA; kargs->transB = transB; kargs->M = M; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; kargs->C = C; kargs->offCY = offC; kargs->ldc.matrix = ldc; kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return 
(clblasStatus)err; } // ================================================================================= // // class clblasSgemmFunctorFallback // // ================================================================================= clblasStatus clblasSgemmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = args.alpha; kargs.beta.argFloat = args.beta; return doGemm(&kargs, args.order, args.transA, args.transB, args.M, args.N, args.K, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.C, args.offC, args.ldc, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasSgemmFunctorFallback * clblasSgemmFunctorFallback::provide () { static clblasSgemmFunctorFallback sgemm_fallback; // The unique instance of clblasSgemmFunctorFallback return & sgemm_fallback; } void clblasSgemmFunctorFallback::retain() { // clblasSgemmFunctorFallback has a single global instance // and shall never be freed } void clblasSgemmFunctorFallback::release() { // clblasSgemmFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasDgemmFunctorFallback // // ================================================================================= clblasStatus clblasDgemmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = args.alpha; kargs.beta.argDouble = args.beta; return doGemm(&kargs, args.order, args.transA, args.transB, args.M, args.N, args.K, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.C, args.offC, args.ldc, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasDgemmFunctorFallback * clblasDgemmFunctorFallback::provide () { static clblasDgemmFunctorFallback dgemm_fallback; // The unique instance of clblasDgemmFunctorFallback return & dgemm_fallback; } void clblasDgemmFunctorFallback::retain() { // clblasDgemmFunctorFallback has a single global instance // and shall never be freed } void clblasDgemmFunctorFallback::release() { // clblasDgemmFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasCgemmFunctorFallback // // ================================================================================= clblasStatus clblasCgemmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = args.alpha; kargs.beta.argFloatComplex = args.beta; return doGemm(&kargs, args.order, args.transA, args.transB, args.M, args.N, args.K, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.C, args.offC, args.ldc, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCgemmFunctorFallback * clblasCgemmFunctorFallback::provide () { static clblasCgemmFunctorFallback cgemm_fallback; // The unique instance of clblasCgemmFunctorFallback return & cgemm_fallback; } void clblasCgemmFunctorFallback::retain() { // clblasCgemmFunctorFallback has a single global instance // and shall never be freed } void clblasCgemmFunctorFallback::release() { // clblasCgemmFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // 
class clblasZgemmFunctorFallback // // ================================================================================= clblasStatus clblasZgemmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = args.alpha; kargs.beta.argDoubleComplex = args.beta; return doGemm(&kargs, args.order, args.transA, args.transB, args.M, args.N, args.K, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.C, args.offC, args.ldc, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZgemmFunctorFallback * clblasZgemmFunctorFallback::provide () { static clblasZgemmFunctorFallback cgemm_fallback; // The unique instance of clblasZgemmFunctorFallback return & cgemm_fallback; } void clblasZgemmFunctorFallback::retain() { // clblasZgemmFunctorFallback has a single global instance // and shall never be freed } void clblasZgemmFunctorFallback::release() { // clblasZgemmFunctorFallback has a single global instance // and shall never be freed } clblas-2.10/src/library/blas/functor/functor_xscal.cc000066400000000000000000000234441264277366700227440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include static clblasStatus doScal( CLBlasKargs *kargs, size_t N, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, X, X, false, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { printf("Invalid Size for X\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx < 0) { // According to Netlib - return for negative incx return clblasSuccess; } listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SCAL, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } // ================================================================================= // // class clblasSscalFunctorFallback // // ================================================================================= static clblasSscalFunctorFallback sscal_fallback; clblasStatus clblasSscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = args.alpha; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasSscalFunctorFallback * clblasSscalFunctorFallback::provide () { static clblasSscalFunctorFallback sscal_fallback; return & sscal_fallback; } void clblasSscalFunctorFallback::retain() { // clblasSscalFunctorFallback has a single global instance // and shall never be freed } void clblasSscalFunctorFallback::release() { // clblasDscalFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasDscalFunctorFallback // // ================================================================================= static clblasDscalFunctorFallback dscal_fallback; clblasStatus clblasDscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = args.alpha; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasDscalFunctorFallback * clblasDscalFunctorFallback::provide () { static clblasDscalFunctorFallback dscal_fallback; return & dscal_fallback; } void clblasDscalFunctorFallback::retain() { // clblasDscalFunctorFallback has a single global instance // and shall never be freed } void clblasDscalFunctorFallback::release() { // clblasDscalFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasCscalFunctorFallback // // ================================================================================= static clblasCscalFunctorFallback cscal_fallback; clblasStatus clblasCscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = args.alpha; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCscalFunctorFallback * clblasCscalFunctorFallback::provide () { static clblasCscalFunctorFallback cscal_fallback; return & cscal_fallback; } void clblasCscalFunctorFallback::retain() { // clblasCscalFunctorFallback has a single global instance // and shall never be freed } void 
clblasCscalFunctorFallback::release() { // clblasCscalFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasZscalFunctorFallback // // ================================================================================= static clblasZscalFunctorFallback zscal_fallback; clblasStatus clblasZscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = args.alpha; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZscalFunctorFallback * clblasZscalFunctorFallback::provide () { static clblasZscalFunctorFallback zscal_fallback; return & zscal_fallback; } void clblasZscalFunctorFallback::retain() { // clblasZscalFunctorFallback has a single global instance // and shall never be freed } void clblasZscalFunctorFallback::release() { // clblasZscalFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasCsscalFunctorFallback // // ================================================================================= static clblasCsscalFunctorFallback csscal_fallback; clblasStatus clblasCsscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; FloatComplex fAlpha; CREAL(fAlpha) = args.alpha; CIMAG(fAlpha) = 0.0f; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloatComplex = fAlpha; kargs.dtype = TYPE_COMPLEX_FLOAT; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCsscalFunctorFallback * clblasCsscalFunctorFallback::provide () { static clblasCsscalFunctorFallback csscal_fallback; return & csscal_fallback; } void clblasCsscalFunctorFallback::retain() { // clblasCsscalFunctorFallback has a single global instance // and shall never be freed } void clblasCsscalFunctorFallback::release() { // clblasCsscalFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasZdscalFunctorFallback // // ================================================================================= static clblasZdscalFunctorFallback zdscal_fallback; clblasStatus clblasZdscalFunctorFallback::execute(Args & args) { CLBlasKargs kargs; DoubleComplex fAlpha; CREAL(fAlpha) = args.alpha; CIMAG(fAlpha) = 0.0f; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDoubleComplex = fAlpha; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doScal(&kargs, args.N, args.X, args.offx, args.incx, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZdscalFunctorFallback * clblasZdscalFunctorFallback::provide () { static clblasZdscalFunctorFallback zdscal_fallback; return & zdscal_fallback; } void clblasZdscalFunctorFallback::retain() { // clblasZdscalFunctorFallback has a single global instance // and shall never be freed } void clblasZdscalFunctorFallback::release() { // clblasZdscalFunctorFallback has a single global instance // and shall never be freed } clblas-2.10/src/library/blas/functor/functor_xscal_generic.cc000066400000000000000000000351261264277366700244400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 
Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include template static cl_program xcalCreateProgram(cl_context ctxt, cl_device_id dev, char type, const char* functorName, const typename FUNCTOR::Data & data, cl_int & err) { BinaryLookup bl(ctxt, dev, functorName); bl.variantInt(data.vecLen); bl.variantInt(data.doVLOAD); bl.variantInt(data.noUnity); if ( bl.found() ) // may create empty file or may wait until file is ready { return bl.getProgram(); } else { char tempTemplate[32*1024]; char buf [32*1024]; cl_program scalProgram; strcpy( tempTemplate, (char*)scal_kernel ); kprintf kobj( type, data.vecLen, data.doVLOAD, data.doVLOAD); kobj.spit((char*)buf, tempTemplate); const char * options; if(data.noUnity) { options = "-DINCX_NONUNITY"; }else{ options = ""; } scalProgram = BinaryLookup::buildProgramFromSource(buf, ctxt, dev, err , options); if(scalProgram) { bl.setProgram(scalProgram); bl.populateCache(); } return scalProgram; } } template static clblasStatus xscalExecute(cl_command_queue queue, cl_program program, const char * kernelName, TA alpha, cl_mem X, unsigned int N, unsigned int offx, int incx, size_t nThreads, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; cl_kernel kernel = clCreateKernel( program, kernelName, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; clblasFunctor::setKernelArg (kernel, 0, alpha); clblasFunctor::setKernelArg (kernel, 1, X); clblasFunctor::setKernelArg (kernel, 2, N); clblasFunctor::setKernelArg (kernel, 3, offx); clblasFunctor::setKernelArg (kernel, 4, incx); size_t globalThreads[1] = { nThreads }; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, NULL , numEventsInWaitList, eventWaitList, events); clReleaseKernel(kernel) ; return clblasStatus(err) ; } template static FUNCTOR * xscalProvide(typename FUNCTOR::Args & args) { cl_device_id dev; cl_context ctxt; cl_int err = clblasFunctor::getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } unsigned int vecLen = 1 ; //to customize according to the device and args bool doVLOAD = false ; //TO DO (see scal_reg.cpp) bool noUnity = (args.incx != 1) ; typename FUNCTOR::Data data = { vecLen , doVLOAD , noUnity}; typename FUNCTOR::Cache::Lookup lookup(FUNCTOR::cache, ctxt, dev, data ) ; if ( lookup.ok() ){ FUNCTOR * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } FUNCTOR * functor = new FUNCTOR(ctxt, dev, data, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } // ================================================================================= // // class clblasSscalFunctorGeneric // // ================================================================================= 
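// ---------------------------------------------------------------------------------
// Illustrative sketch only (not part of clBLAS): the xscalProvide() helper above
// follows a "lookup or create, then retain" pattern over a per-(context, device)
// functor cache. The snippet below restates that control flow with invented names
// (SimpleFunctorCache, DummyFunctor); it assumes nothing about the real
// clblasFunctorCache beyond what the code above shows.
// ---------------------------------------------------------------------------------
#include <map>
#include <tuple>
#include <mutex>

struct DummyFunctor {
    int refcount = 1;
    void retain()  { ++refcount; }                 // caller keeps the instance alive
    void release() { if (--refcount == 0) delete this; }
};

class SimpleFunctorCache {
    std::map<std::tuple<void*, void*, int>, DummyFunctor*> entries; // key: (ctxt, dev, variant data)
    std::mutex lock;
public:
    // Return a cached functor (retained) or create and cache a new one.
    DummyFunctor * provide(void * ctxt, void * dev, int key) {
        std::lock_guard<std::mutex> guard(lock);
        auto it = entries.find(std::make_tuple(ctxt, dev, key));
        if (it != entries.end()) {
            it->second->retain();                  // cache hit: bump the reference count, as xscalProvide does
            return it->second;
        }
        DummyFunctor * f = new DummyFunctor();     // cache miss: build a new functor and remember it
        entries[std::make_tuple(ctxt, dev, key)] = f;
        return f;
    }
};
// ---------------------------------------------------------------------------------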
clblasSscalFunctorGeneric::clblasSscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'S', "clblasSscalFunctorGeneric", data, err); } clblasSscalFunctorGeneric::~clblasSscalFunctorGeneric() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasSscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args return xscalExecute(args.queue, this->program, "Sscal_kernel", args.alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasSscalFunctorGeneric::Cache clblasSscalFunctorGeneric::cache; clblasSscalFunctorGeneric * clblasSscalFunctorGeneric::provide (clblasSscalFunctor::Args & args) { return xscalProvide(args); } // ================================================================================= // // class clblasDscalFunctorGeneric // // ================================================================================= clblasDscalFunctorGeneric::clblasDscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'D', "clblasDscalFunctorGeneric", data, err); } clblasDscalFunctorGeneric::~clblasDscalFunctorGeneric() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasDscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args return xscalExecute(args.queue, this->program, "Dscal_kernel", args.alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasDscalFunctorGeneric::Cache clblasDscalFunctorGeneric::cache; clblasDscalFunctorGeneric * clblasDscalFunctorGeneric::provide (clblasDscalFunctor::Args & args) { return xscalProvide(args); } // ================================================================================= // // class clblasCscalFunctorGeneric // // ================================================================================= clblasCscalFunctorGeneric::clblasCscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'C', "clblasCscalFunctorGeneric", data, err); } clblasCscalFunctorGeneric::~clblasCscalFunctorGeneric() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasCscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args return xscalExecute(args.queue, this->program, "Cscal_kernel", args.alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCscalFunctorGeneric::Cache clblasCscalFunctorGeneric::cache; clblasCscalFunctorGeneric * clblasCscalFunctorGeneric::provide (clblasCscalFunctor::Args & args) { return xscalProvide(args); } // ================================================================================= // // class clblasZscalFunctorGeneric // // ================================================================================= clblasZscalFunctorGeneric::clblasZscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'Z', "clblasZscalFunctorGeneric", data, err); } clblasZscalFunctorGeneric::~clblasZscalFunctorGeneric() { if (this->program) { 
clReleaseProgram( this->program ) ; } } clblasStatus clblasZscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args return xscalExecute(args.queue, this->program, "Zscal_kernel", args.alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZscalFunctorGeneric::Cache clblasZscalFunctorGeneric::cache; clblasZscalFunctorGeneric * clblasZscalFunctorGeneric::provide (clblasZscalFunctor::Args & args) { return xscalProvide(args); } // ================================================================================= // // class clblasCsscalFunctorGeneric // // ================================================================================= clblasCsscalFunctorGeneric::clblasCsscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'C', "clblasCsscalFunctorGeneric", data, err); } clblasCsscalFunctorGeneric::~clblasCsscalFunctorGeneric() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasCsscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args cl_float2 l_alpha; l_alpha.s[0] = args.alpha ; l_alpha.s[1] = 0.f ; return xscalExecute(args.queue, this->program, "Cscal_kernel", l_alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCsscalFunctorGeneric::Cache clblasCsscalFunctorGeneric::cache; clblasCsscalFunctorGeneric * clblasCsscalFunctorGeneric::provide (clblasCsscalFunctor::Args & args) { return xscalProvide(args); } // ================================================================================= // // class clblasZdscalFunctorGeneric // // ================================================================================= clblasZdscalFunctorGeneric::clblasZdscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err) : program(0) { this->program = xcalCreateProgram(ctxt, dev, 'Z', "clblasZdscalFunctorGeneric", data, err); } clblasZdscalFunctorGeneric::~clblasZdscalFunctorGeneric() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasZdscalFunctorGeneric::execute(Args & args) { size_t nThreads = args.N; //to customize according to the device, data and args cl_double2 l_alpha; l_alpha.s[0] = args.alpha ; l_alpha.s[1] = 0.f ; return xscalExecute(args.queue, this->program, "Zscal_kernel", l_alpha, args.X, args.N, args.offx, args.incx, nThreads, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZdscalFunctorGeneric::Cache clblasZdscalFunctorGeneric::cache; clblasZdscalFunctorGeneric * clblasZdscalFunctorGeneric::provide (clblasZdscalFunctor::Args & args) { return xscalProvide(args); } clblas-2.10/src/library/blas/functor/functor_xtrsm.cc000066400000000000000000000205631264277366700230060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include // // Common part of all XTRSM implementations using the old Solver infrastructure // static clblasStatus doTrsm( CLBlasKargs *kargs, clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t msize; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET ))) { return retCode; } msize = (side == clblasLeft) ? M : N; if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offB, ldb, B_MAT_ERRSET ))) { return retCode; } kargs->order = order; kargs->side = side; kargs->uplo = uplo; kargs->transA = transA; kargs->diag = diag; kargs->M = M; kargs->N = N; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; // Store original problem size in K, this is used to know it while // calculating result by parts using M or N as part size if (side == clblasLeft) { kargs->K = M; } else { kargs->K = N; } kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; #ifndef TRXM_MULTIPLE_QUEUES if (numCommandQueues != 0) { numCommandQueues = 1; } #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_TRSM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } // ================================================================================= // // class clblasStrsmFunctorFallback // // ================================================================================= clblasStatus clblasStrsmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = args.alpha; return doTrsm(&kargs, args.order, args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasStrsmFunctorFallback * clblasStrsmFunctorFallback::provide () { static clblasStrsmFunctorFallback strsm_fallback; // The unique instance of clblasStrsmFunctorFallback return & strsm_fallback; } void clblasStrsmFunctorFallback::retain() { // clblasStrsmFunctorFallback has a single global instance // and shall never be freed } void clblasStrsmFunctorFallback::release() { // clblasStrsmFunctorFallback has a single global instance // and shall never be freed } // 
================================================================================= // // class clblasDtrsmFunctorFallback // // ================================================================================= clblasStatus clblasDtrsmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = args.alpha; return doTrsm(&kargs, args.order, args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasDtrsmFunctorFallback * clblasDtrsmFunctorFallback::provide () { static clblasDtrsmFunctorFallback dtrsm_fallback; // The unique instance of clblasDtrsmFunctorFallback return & dtrsm_fallback; } void clblasDtrsmFunctorFallback::retain() { // clblasDtrsmFunctorFallback has a single global instance // and shall never be freed } void clblasDtrsmFunctorFallback::release() { // clblasDtrsmFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasCtrsmFunctorFallback // // ================================================================================= clblasStatus clblasCtrsmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = args.alpha; return doTrsm(&kargs, args.order, args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasCtrsmFunctorFallback * clblasCtrsmFunctorFallback::provide () { static clblasCtrsmFunctorFallback ctrsm_fallback; // The unique instance of clblasCtrsmFunctorFallback return & ctrsm_fallback; } void clblasCtrsmFunctorFallback::retain() { // clblasCtrsmFunctorFallback has a single global instance // and shall never be freed } void clblasCtrsmFunctorFallback::release() { // clblasCtrsmFunctorFallback has a single global instance // and shall never be freed } // ================================================================================= // // class clblasZtrsmFunctorFallback // // ================================================================================= clblasStatus clblasZtrsmFunctorFallback::execute(Args & args) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = args.alpha; return doTrsm(&kargs, args.order, args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, 1, &args.queue, args.numEventsInWaitList, args.eventWaitList, args.events); } clblasZtrsmFunctorFallback * clblasZtrsmFunctorFallback::provide () { static clblasZtrsmFunctorFallback ztrsm_fallback; // The unique instance of clblasZtrsmFunctorFallback return & ztrsm_fallback; } void clblasZtrsmFunctorFallback::retain() { // clblasZtrsmFunctorFallback has a single global instance // and shall never be freed } void clblasZtrsmFunctorFallback::release() { // clblasZtrsmFunctorFallback has a single global instance // and shall never be freed } clblas-2.10/src/library/blas/functor/gcn_dgemm.cc000066400000000000000000000763201264277366700220130ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include 
#include #include #include #include "BinaryBuild.h" //for the moment only managing source code and cl binary //#if BUILD_KERNEL_FROM_STRING //#include "dgemm_hawai.clT" //#else //#include "dgemm_hawai.cl_32.bin.clT" //#include "dgemm_hawai.cl_64.bin.clT" //#endif // // //cl_uint _64Bits = 32; // //// //// The name of the 'const char *' providing the kernel OpenCL source //// //// dgemm_TATB_DIVN_DIVM_DIVK_BS0xBS1_NV0xNV1 //// //// For instance, DGEMM_SRC_NAME(N,T,32,64,8,8,8,4,8) is dgemm_NT_32_64_8_8x8_4x8 //// //#define DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1 //#define DGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##_##BITS##_bin_Tahiti //#define DGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##_##BITS##_bin_Hawaii // //// //// The name of the 'const char []' global variable that contain the SPIR data. //// That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir //// //#define DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // //// //// The name of the 'const char []' global variable that contain the CL binaries data. //// That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin //// // // //// The name of the kernel itself. //// This is basically the name returned by DGEMM_SRC_NAME but as string //// //#define DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) "dgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 // //// //// Helpers to transform N and T in proper clblas values for the macros above //// //#define trans_N clblasNoTrans //#define trans_T clblasTrans // // //// Fill a variant descriptor using OpenCL source //#define DGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) { \ // DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ // DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ // NULL, NULL, 0, \ // trans_##TA, trans_##TB, \ // DIVN,DIVM,DIVK, \ //{ BS0, BS1 } , \ //{ NV0, NV1 } \ //} // //// Fill a variant descriptor using SPIR //#define DGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) { \ // DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ // NULL , "-x spir -spir-std=1.2" \ // DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ // sizeof(DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ // trans_##TA,trans_##TB, \ // DIVN,DIVM,DIVK, \ //{ BS0, BS1 } , \ //{ NV0, NV1 } \ //} // //// Fill a variant descriptor using CL Binaries //#define DGEMM_VARIANT_BIN(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE) { \ // DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ // NULL , NULL, \ // DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS), \ // sizeof(DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS)), \ // trans_##TA,trans_##TB, \ // DIVN,DIVM,DIVK, \ //{ BS0, BS1 } , \ //{ NV0, NV1 } \ //} // Make it 1 to enable additional debug 'print' #define VERB 0 // //// Just because the full name is too long //typedef clblasDgemmFunctorGCN::Variant Variant ; // //// //// The static cache used to store all instances of clblasDgemmFunctorTahiti //// //typedef clblasFunctorCache Cache ; //static Cache cache ; // // //// return true iff a kernel variant is applicable 
to the specified args //static bool applicable( const Variant & var, clblasDgemmFunctor::Args & args ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // if ( args.N % var.divN != 0 ) return false ; // if ( args.M % var.divM != 0 ) return false ; // if ( args.K % var.divK != 0 ) return false ; // return true ; //} // //// //// The goal of this function is to return the Variant to be used //// for the DGEMM specified by 'args'. //// //// The variants are typically tested sequentially from the more //// specific to the more generic. Additional conditions can be //// placed into the surrounding 'if' (typically that would be //// to perform additional tests on M, N and K). //// //// //static const Variant * select_variant( clblasDgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) //{ // // // if ( args.transA == clblasNoTrans ) // { // if ( args.transB == clblasNoTrans ) // { // // // ===== dgemm NN ====== // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,N,48,48,8,8,8,6,6) ; // if ( applicable(variant,args) ) // return &variant ; // //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // //#endif // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,N,32,32,8,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,N,32,32,1,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return 
&variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,N,1,1,8,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // The generic version shall be last // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,N,1,1,1,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // } // else // { // // ===== dgemm NT ====== // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,T,48,48,8,8,8,6,6) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,T,32,32,8,8,8,4,4) ; // if ( 
applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,T,32,32,1,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,T,1,1,8,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // The generic version shall be last // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(N,T,1,1,1,8,8,4,4) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,HAWAII) ; // if ( 
applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // } // } // else // { // if ( args.transB == clblasNoTrans ) // { // // ===== dgemm TN ====== // // if ( args.M >= 2000 && args.N >= 2000 ) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,48,48,16,8,8,6,6) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,48,48,8,8,8,6,6) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,32,32,16,8,16,4,2) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,32,32,1,8,16,4,2) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { 
// static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,1,1,16,8,16,4,2) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // The generic version shall be last // if (true) // { //#if BUILD_KERNEL_FROM_STRING // static const Variant variant = DGEMM_VARIANT_SRC(T,N,1,1,1,8,16,4,2) ; // if ( applicable(variant,args) ) // return &variant ; //#else // if(!strcmp(DevName, "Tahiti")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,32,TAHITI) ; // if ( applicable(variant,args) ) // return &variant ; // } // } // else if(!strcmp(DevName, "Hawaii")) // { // if(_64BitsUse==64) // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // // else // { // static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,32,HAWAII) ; // if ( applicable(variant,args) ) // return &variant ; // } // } //#endif // // } // // // } // else // { // // ===== dgemm TT ====== // // // TODO // } // } // // // return NULL ; // No suitable variant ... will use the fallback // //} //clblasDgemmFunctorGCN::clblasDgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) : // m_program(0) , m_variant(variant) //{ // // cl_device_id device; // cl_context context; // // cl_command_queue queue = args.queue; // err = getDeviceAndContext(queue, device, context); // if( err != CL_SUCCESS ) // { // return; // } // // if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; // // //Ben do I use the correct "kernel_name"? 
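// ---------------------------------------------------------------------------------
// Illustrative sketch only: the commented-out constructor around this point looks
// the kernel program up in a BinaryLookup cache and, on a miss, rebuilds it either
// from a pre-compiled device binary or from OpenCL C source. The helper below shows
// that same decision with plain OpenCL calls; it is a simplified stand-in, not the
// real BinaryLookup implementation, and all names here are local to this example.
// ---------------------------------------------------------------------------------
#include <CL/cl.h>
#include <cstring>

static cl_program buildOrLoadProgram(cl_context ctx, cl_device_id dev,
                                     const unsigned char * bin, size_t binSize,
                                     const char * source, const char * options,
                                     cl_int & err)
{
    cl_program program = 0;
    if (bin != 0 && binSize != 0) {
        // Pre-compiled path: load the device binary directly.
        program = clCreateProgramWithBinary(ctx, 1, &dev, &binSize, &bin, NULL, &err);
    } else {
        // Source path: compile the OpenCL C text.
        size_t len = strlen(source);
        program = clCreateProgramWithSource(ctx, 1, &source, &len, &err);
    }
    if (err != CL_SUCCESS) return 0;
    err = clBuildProgram(program, 1, &dev, options, NULL, NULL);
    return program;
}
// ---------------------------------------------------------------------------------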
// BinaryLookup bl(context, device, "clblasDgemmFunctorGCN"); // // bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; // // if ( !bl.found() ) // may create empty file or may wait until file is ready // { // if ( this->m_variant->bin != 0 ) // { // // build from a pre-compiled version of the kernel (SPIR or cl binaries) // err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); // } // else // { // // directly build from a char* // err = bl.buildFromSource(this->m_variant->source); // } // // if ( err != CL_SUCCESS ) // { // if (VERB) printf(" ===> BUILD PROBLEM\n") ; // // return; // } // } // // this->m_program = bl.getProgram(); //} clblasStatus clblasDgemmFunctorGCN::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s\n", this->m_variant->kernel_name) ; cl_kernel kernel = clCreateKernel( this->m_program, this->m_variant->kernel_name, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; if (VERB) printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ; int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg=0 ; // All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) setKernelArg(kernel, arg++, args.A); setKernelArg(kernel, arg++, args.B); setKernelArg(kernel, arg++, args.C); setKernelArg(kernel, arg++, M); setKernelArg(kernel, arg++, N); setKernelArg(kernel, arg++, K); setKernelArg(kernel, arg++, args.alpha); if (args.beta!=0 && this->m_variant->mult.compare("__ALPHA")!=0) setKernelArg(kernel, arg++, args.beta); setKernelArg(kernel, arg++, lda); setKernelArg(kernel, arg++, ldb); setKernelArg(kernel, arg++, ldc); setKernelArg(kernel, arg++, offsetA); setKernelArg(kernel, arg++, offsetB); setKernelArg(kernel, arg++, offsetC); const size_t * ls = this->m_variant->ls ; // Each work group is made of ls[0] x ls[1] PE const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); thy = thy/ls[1] + ((thy%ls[1] != 0) ? 1 : 0); thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, NULL , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } //clblasDgemmFunctorGCN * // clblasDgemmFunctorGCN::provide(clblasDgemmFunctor::Args & args, const char* DevName) //{ // // if ( args.order == clblasRowMajor ) // return NULL ; // The RowMajor case shall never occur. 
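// ---------------------------------------------------------------------------------
// Illustrative sketch only: clblasDgemmFunctorGCN::execute() above derives the
// global NDRange from the variant's tiling. Each work-item computes bwi[0] x bwi[1]
// values of C and each work-group holds ls[0] x ls[1] work-items, so the launch
// size in each dimension is ceil(dim / bwi) rounded up to a multiple of ls. The
// helper below restates that arithmetic; the names are local to this example.
// ---------------------------------------------------------------------------------
#include <cstddef>

static size_t roundedGlobalSize(size_t elements, size_t perItem, size_t groupSize)
{
    size_t items  = elements / perItem + ((elements % perItem != 0) ? 1 : 0); // ceil(elements / perItem)
    size_t groups = items / groupSize  + ((items % groupSize  != 0) ? 1 : 0); // ceil(items / groupSize)
    return groups * groupSize;  // a multiple of the work-group size, matching the kernel's tiling
}
// Example: M = 1000, bwi[0] = 6, ls[0] = 8  ->  ceil(1000/6) = 167  ->  rounded up to 168 work-items.
// ---------------------------------------------------------------------------------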
// // cl_device_id dev; // cl_context ctxt; // // cl_int err = getDeviceAndContext(args.queue, dev, ctxt); // if (err != CL_SUCCESS) // { // return NULL; // } // cl_uint bitness = getAddressBits(dev); // // const Variant * variant = select_variant( args, DevName, bitness ) ; // if ( variant == NULL ) // return NULL ; // // // // // Cache::Lookup lookup(cache, ctxt, dev, variant) ; // // if ( lookup.ok() ){ // clblasDgemmFunctorGCN * functor = lookup.get(); // functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used // return functor; // } // // clblasDgemmFunctorGCN * functor = new clblasDgemmFunctorGCN(args, variant, err); // if (err != CL_SUCCESS) // { // return NULL; // } // // lookup.set(functor) ; // // return functor; // //} clblas-2.10/src/library/blas/functor/gcn_dgemmCommon.cc000066400000000000000000001012201264277366700231500ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "gcn_dgemmCommon.h" #if BUILD_KERNEL_FROM_STRING #include "dgemm_hawai.clT" #else #include "dgemm_hawai.clHawaii_64.bin.clT" #include "dgemm_hawai.clTahiti_64.bin.clT" #endif //cl_uint _64Bits = 32; //dgemm_NT_48_48_8_8x8_6x6_ALPHA_32_bin_Tahiti // // The name of the 'const char *' providing the kernel OpenCL source // // dgemm_TATB_DIVN_DIVM_DIVK_BS0xBS1_NV0xNV1 // // For instance, DGEMM_SRC_NAME(N,T,32,64,8,8,8,4,8) is dgemm_NT_32_64_8_8x8_4x8 // #define DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT #define DGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti #define DGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii // // The name of the 'const char []' global variable that contain the SPIR data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir // #define DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // // The name of the 'const char []' global variable that contain the CL binaries data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin // // The name of the kernel itself. 
// This is basically the name returned by DGEMM_SRC_NAME but as string // #define DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "dgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT // // Helpers to transform N and T in proper clblas values for the macros above // #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define DGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL, NULL, 0, \ trans_##TA, trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using SPIR #define DGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , "-x spir -spir-std=1.2" \ DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ sizeof(DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using CL Binaries #define DGEMM_VARIANT_BIN(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , NULL, \ DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT), \ sizeof(DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Make it 1 to enable additional debug 'print' #define VERB 0 // Just because the full name is too long typedef clblasDgemmFunctorGCN::Variant Variant ; // // The static cache used to store all instances of clblasDgemmFunctorTahiti // typedef clblasFunctorCache Cache ; static Cache cache ; // return true iff a kernel variant is applicable to the specified args static bool applicable( const Variant & var, clblasDgemmFunctor::Args & args ) { #if 0 // Transpose values are tested in select_variant if ( args.transA != var.transA ) return false ; if ( args.transB != var.transB ) return false ; #endif if ( args.N % var.divN != 0 ) return false ; if ( args.M % var.divM != 0 ) return false ; if ( args.K % var.divK != 0 ) return false ; if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) return false ; return true ; } // // The goal of this function is to return the Variant to be used // for the DGEMM specified by 'args'. // // The variants are typically tested sequentially from the more // specific to the more generic. Additional conditions can be // placed into the surrounding 'if' (typically that would be // to perform additional tests on M, N and K). 
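// ---------------------------------------------------------------------------------
// Illustrative sketch only: select_variant() below walks the variants from most
// specialised to most generic and keeps the first one whose divisibility
// requirements hold (see applicable() above). For reference, the naming macro turns
// DGEMM_KERNEL_NAME(N,T,48,48,8,8,8,6,6,__ALPHABETA) into the string
// "dgemm_NT_48_48_8_8x8_6x6__ALPHABETA". The toy selector here uses an invented
// tile table (ToyVariant, pickToyVariant) purely to make the selection idea explicit.
// ---------------------------------------------------------------------------------
#include <cstddef>

struct ToyVariant { size_t divM, divN, divK; const char * name; };

static const ToyVariant * pickToyVariant(size_t M, size_t N, size_t K)
{
    // Ordered from the most specialised tiling to the always-applicable fallback.
    static const ToyVariant table[] = {
        { 48, 48, 8, "48x48 tile, K a multiple of 8" },
        { 32, 32, 8, "32x32 tile, K a multiple of 8" },
        {  1,  1, 1, "generic fallback"              },
    };
    for (const ToyVariant & v : table) {
        if (M % v.divM == 0 && N % v.divN == 0 && K % v.divK == 0)
            return &v;   // first (most specialised) applicable variant wins
    }
    return NULL; // unreachable: the last entry accepts any problem size
}
// ---------------------------------------------------------------------------------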
// // static const Variant * select_variant( clblasDgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { // ===== dgemm NN ====== if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,48,48,8,8,8,6,6,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,48,48,8,8,8,6,6,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; //const char * test = DGEMM_KERNEL_NAME(N,N,48,48,8,8,8,6,6, __ALPHA); // test static const Variant variantA = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,48,48,8,8,8,6,6,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,32,32,8,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,32,32,8,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,32,32,8,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,40,40,8,8,8,5,5,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,40,40,8,8,8,5,5,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,40,40,8,8,8,5,5,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,40,40,8,8,8,5,5,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,40,40,8,8,8,5,5,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = 
DGEMM_VARIANT_BIN(N,N,40,40,8,8,8,5,5,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,32,32,1,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,32,32,1,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,32,32,1,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,1,1,8,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,1,1,8,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,1,1,8,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } // The generic version shall be last if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,1,1,1,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,1,1,1,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,1,1,1,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } } else { // ===== dgemm NT ====== if (true) { #if BUILD_KERNEL_FROM_STRING 
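// Explanatory note on the descriptors used below (see the macros defined near
// the top of this file): DGEMM_VARIANT_SRC(N,T,48,48,8,8,8,6,6,__ALPHABETA)
// brace-initializes a Variant whose kernel_name is
// "dgemm_NT_48_48_8_8x8_6x6__ALPHABETA", whose 'source' points at the
// matching OpenCL string compiled into the library, whose bin/bin_size are
// NULL/0, and whose tiling is divN = divM = 48, divK = 8 with ls = {8,8} and
// bwi = {6,6} (the local size and values-per-work-item used at launch time).
// The DGEMM_VARIANT_BIN form used in the #else branch instead fills
// bin/bin_size from the per-device pre-compiled binary and leaves 'source'
// NULL.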
static const Variant variant = DGEMM_VARIANT_SRC(N,T,48,48,8,8,8,6,6,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,48,48,8,8,8,6,6, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,48,48,8,8,8,6,6,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,32,32,8,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,32,32,8,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,32,32,8,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,40,40,8,8,8,5,5,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,40,40,8,8,8,5,5,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,40,40,8,8,8,5,5,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,40,40,8,8,8,5,5,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,40,40,8,8,8,5,5,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,40,40,8,8,8,5,5,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,32,32,1,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,32,32,1,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return 
&variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,32,32,1,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,1,1,8,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,1,1,8,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,1,1,8,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } // The generic version shall be last if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,1,1,1,8,8,4,4,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,1,1,1,8,8,4,4,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,1,1,1,8,8,4,4,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } } } else { if ( args.transB == clblasNoTrans ) { // ===== dgemm TN ====== if ( args.M >= 2000 && args.N >= 2000 ) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,48,48,16,8,8,6,6,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,48,48,16,8,8,6,6,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = 
DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,48,48,16,8,8,6,6,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,48,48,8,8,8,6,6,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,48,48,8,8,8,6,6,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,48,48,8,8,8,6,6,64,HAWAII, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,32,32,16,8,16,4,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,32,32,16,8,16,4,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,32,32,16,8,16,4,2,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,32,32,1,8,16,4,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,32,32,1,8,16,4,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) 
return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,32,32,1,8,16,4,2,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,1,1,16,8,16,4,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,1,1,16,8,16,4,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,1,1,16,8,16,4,2,64,HAWAII, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } // The generic version shall be last if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,1,1,1,8,16,4,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,1,1,1,8,16,4,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,TAHITI,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,TAHITI,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,HAWAII,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(T,N,1,1,1,8,16,4,2,64,HAWAII,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } } else { // ===== dgemm TT ====== // TODO } } return NULL ; // No suitable variant ... will use the fallback } clBlasGCNdgemmCommonFunctor::clBlasGCNdgemmCommonFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program = NULL; m_variant = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? 
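//
// The lookup below is keyed on the raw kernel name of the selected variant.
// On a cache miss the program is (re)built: from the embedded pre-compiled
// binary when m_variant->bin is set, otherwise from the OpenCL source
// string; the resulting cl_program is stored in m_program and reused by
// execute() each time a kernel is launched through this functor.
//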
BinaryLookup bl(context, device, "clBlasGCNdgemmCommonFunctor"); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != 0 ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { // directly build from a char* err = bl.buildFromSource(this->m_variant->source); } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlasGCNdgemmCommonFunctor * clBlasGCNdgemmCommonFunctor::provide(clblasDgemmFunctor::Args & args, const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; Cache::Lookup lookup(cache, ctxt, dev, variant) ; if ( lookup.ok() ){ clBlasGCNdgemmCommonFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlasGCNdgemmCommonFunctor * functor = new clBlasGCNdgemmCommonFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/gcn_dgemmSmallMatrices.cc000066400000000000000000000533561264277366700245000ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "gcn_dgemmSmallMatrices.h" #if BUILD_KERNEL_FROM_STRING #include "dgemm_gcn_SmallMatrices.clT" #else #include "dgemm_gcn_SmallMatrices.clHawaii_64.bin.clT" #include "dgemm_gcn_SmallMatrices.clTahiti_64.bin.clT" #endif #define DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT #define DGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti #define DGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii // // The name of the 'const char []' global variable that contain the SPIR data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir // #define DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) dgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // // The name of the 'const char []' global variable that contain the CL binaries data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin // // The name of the kernel itself. 
// This is basically the name returned by DGEMM_SRC_NAME but as string // #define DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "dgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT // // Helpers to transform N and T in proper clblas values for the macros above // #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define DGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ DGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL, NULL, 0, \ trans_##TA, trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using SPIR #define DGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , "-x spir -spir-std=1.2" \ DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ sizeof(DGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using CL Binaries #define DGEMM_VARIANT_BIN(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE, MULT) { \ DGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , NULL, \ DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT), \ sizeof(DGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Just because the full name is too long typedef clBlasGCNDgemmSmallMatricesFunctor::Variant Variant ; ////define the string name of the soure/binary code //#define DGEMM_SRC_NAME(TA,TB,MULT) dgemm_##TA##TB##_SMALL##MULT //#define DGEMM_SRC_NAME_HAWAII(TA,TB, MULT, BITS) dgemm_##TA##TB##_SMALL##MULT##_##BITS##_bin_Hawaii // ////variant name used to differentiate the different ones //#define DGEMM_VARIANT_NAME(TA,TB, MULT) "dgemm_" #TA #TB "_SMALL" #MULT ////DGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), // //#define DGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "dgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_SPLIT_" #BLOC // // //#define trans_N clblasNoTrans //#define trans_T clblasTrans // //// Fill a variant descriptor using OpenCL source //#define DGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ // KERNEL_NAME_MAIN, KERNEL_NAME_ROW, \ // KERNELS_SRC, \ // KERNEL_BUILD_OPTIONS, \ // KERNELS_BIN, \ // KERNEL_BIN_SIZE) { \ // DGEMM_VARIANT_NAME(TA,TB, MULT), \ //{ KERNEL_NAME_MAIN, KERNEL_NAME_ROW } , \ // KERNELS_SRC, \ // KERNEL_BUILD_OPTIONS, \ // KERNELS_BIN, \ // KERNEL_BIN_SIZE, \ // trans_##TA, trans_##TB, \ // DIVK , \ //{ BS0, BS1 } , \ //{ NV0, NV1 } , \ //#MULT \ //} typedef clblasFunctorCache CacheSMall ; static CacheSMall cachesmall ; // Make it 1 to enable additional debug 'print' #define VERB 0 static bool applicable( const Variant & var, clblasDgemmFunctor::Args & args ) { #if 0 // Transpose values are tested in select_variant if ( args.transA != var.transA ) return false ; if ( args.transB != var.transB ) return false ; #endif //if (args.N>=var.divN && args.N % var.divN != 0 ) if ( args.N % var.divN != 0 ) return false ; if ( args.M % var.divM != 0 ) return false ; if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) return false ; return true ; } static const Variant * 
select_variant_GCNSmallMatrices( clblasDgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,16,16,8,8,8,2,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,16,16,8,8,8,2,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,16,16,8,8,8,2,2,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; // const char * test = DGEMM_KERNEL_NAME(N,N,48,48,8,8,8,6,6, __ALPHA); // test static const Variant variantA = DGEMM_VARIANT_BIN(N,N,16,16,8,8,8,2,2,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,16,16,8,8,8,2,2,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; //const char * test = DGEMM_KERNEL_NAME(N,N,48,48,8,8,8,6,6, __ALPHA); // test static const Variant variantA = DGEMM_VARIANT_BIN(N,N,16,16,8,8,8,2,2,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,24,24,8,8,8,3,3,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,24,24,8,8,8,3,3,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,24,24,8,8,8,3,3,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,24,24,8,8,8,3,3,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,N,24,24,8,8,8,3,3,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,N,24,24,8,8,8,3,3,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } } if (args.transB == clblasTrans) { if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,16,16,8,8,8,2,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,16,16,8,8,8,2,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,16,16,8,8,8,2,2,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; // const char * test = DGEMM_KERNEL_NAME(N,N,48,48,8,8,8,6,6, __ALPHA); // test static const Variant variantA = DGEMM_VARIANT_BIN(N,T,16,16,8,8,8,2,2,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef 
CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,16,16,8,8,8,2,2,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; //const char * test = DGEMM_KERNEL_NAME(N,N,48,48,8,8,8,6,6, __ALPHA); // test static const Variant variantA = DGEMM_VARIANT_BIN(N,T,16,16,8,8,8,2,2,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,24,24,8,8,8,3,3,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,24,24,8,8,8,3,3,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,24,24,8,8,8,3,3,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,24,24,8,8,8,3,3,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = DGEMM_VARIANT_BIN(N,T,24,24,8,8,8,3,3,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_BIN(N,T,24,24,8,8,8,3,3,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } #endif } } } return NULL; } clBlasGCNDgemmSmallMatricesFunctor::clBlasGCNDgemmSmallMatricesFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variant = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlasGCNDgemmSmallMatricesFunctor"); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { //// directly build from a char* //for (int i=0; i<4; i++) // if(this->m_variantSplit->source[i] != 0) err = bl.buildFromSource(this->m_variant->source); //if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR SPLIT DGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlasGCNDgemmSmallMatricesFunctor * clBlasGCNDgemmSmallMatricesFunctor::provide(clblasDgemmFunctor::Args & args, const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
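//
// provide() resolves the device and context from the queue, asks
// select_variant_GCNSmallMatrices() for a variant matching the problem
// sizes, and then consults the per-(context, device, variant) functor cache:
// on a hit the cached functor is retained and returned, on a miss a new
// functor is constructed (which loads or builds the program) and registered
// in the cache so that later calls with the same variant can reuse it.
//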
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant_GCNSmallMatrices( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; CacheSMall::Lookup lookup(cachesmall, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlasGCNDgemmSmallMatricesFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlasGCNDgemmSmallMatricesFunctor * functor = new clBlasGCNDgemmSmallMatricesFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } //cl_int clBlasGCNDgemmSmallMatricesFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args) //{ // size_t GlobalX =args.M/m_variantSplit->bwi[0]; // GlobalX-=GlobalX%m_variantSplit->ls[0]; // // // // size_t GlobalY = args.N/m_variantSplit->bwi[1]; // GlobalY-=GlobalY%m_variantSplit->ls[1]; // // // std::size_t gs[2] = {GlobalX, GlobalY}; // cl_int error = 0; // // if (args.M%48==0 && args.N%48==0) // { // if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ; // error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,args.events); // return error; // } // // if (args.M%48!=0 && args.N%48!=0 && args.M>=48 && args.N>=48 ) // { // if (VERB) printf(" ===> EXECUTE KERNEL 0, 1, 2, 3 \n") ; // error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); // // gs[0] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); // // gs[1] = 8; // gs[0] = GlobalX; // error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); // // gs[0] = 8; gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); // return error; // } // if (args.M%48==0 && args.N%48!=0 && args.N>48 ) // { // if (VERB) printf(" ===> EXECUTE KERNEL 0, 2, \n") ; // // error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); // gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); // // return error; // } // if (args.N%48==0 && args.M%48!=0 && args.M>48 ) // { // if (VERB) printf(" ===> EXECUTE KERNEL 0, 1 \n") ; // // error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); // gs[0] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); // // return error; // } // if(args.M<48 && args.N%48==0) // { // if (VERB) printf(" ===> EXECUTE KERNEL 1, \n") ; // // gs[0] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); // return error; // } // if(args.M<48 && args.N%48!=0 && args.N>=48) // { // if (VERB) printf(" ===> EXECUTE KERNEL 1, 3 \n") ; // // gs[0] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); // gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); // return error; // } // if(args.N<48 
&& args.M%48==0) // { // if (VERB) printf(" ===> EXECUTE KERNEL 2 \n") ; // // gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); // return error; // } // if(args.N<48 && args.M%48!=0&& args.M>=48) // { // if (VERB) printf(" ===> EXECUTE KERNEL 2, 3 \n") ; // // gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); // // gs[0] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); // return error; // } // if (args.N<48 && args.M<48) // { // if (VERB) printf(" ===> EXECUTE KERNEL 3 \n") ; // gs[0] = 8; gs[1] = 8; // error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls,args.numEventsInWaitList, args.eventWaitList, args.events); // return error; // } // // return clblasNotImplemented; //} //need to rewrite execute!!! clblasStatus clBlasGCNDgemmSmallMatricesFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variant->kernel_name, args.alpha, args.beta) ; cl_kernel kernel; // int NBKernel = 0; kernel= clCreateKernel( this->m_program, this->m_variant->kernel_name, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; //if (NBKernel != 4) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ; } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg=0 ; //// All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) setKernelArg(kernel, arg++, args.A); setKernelArg(kernel, arg++, args.B); setKernelArg(kernel, arg++, args.C); setKernelArg(kernel, arg++, M); setKernelArg(kernel, arg++, N); setKernelArg(kernel, arg++, K); setKernelArg(kernel, arg++, args.alpha); if (args.beta!=0 && this->m_variant->mult.compare("__ALPHA")!=0) setKernelArg(kernel, arg++, args.beta); setKernelArg(kernel, arg++, lda); setKernelArg(kernel, arg++, ldb); setKernelArg(kernel, arg++, ldc); setKernelArg(kernel, arg++, offsetA); setKernelArg(kernel, arg++, offsetB); setKernelArg(kernel, arg++, offsetC); // err = KernelsLaunch(queue, kernel, args); const size_t * ls = this->m_variant->ls ; // Each work group is made of ls[0] x ls[1] PE const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); thy = thy/ls[1] + ((thy%ls[1] != 0) ? 
1 : 0); thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, ls , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblas-2.10/src/library/blas/functor/gcn_sgemm.cc000066400000000000000000000453561264277366700220370ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include #include "BinaryBuild.h" //for the moment only managing source code and cl binary #if BUILD_KERNEL_FROM_STRING #include "sgemm_gcn.clT" #else #include "sgemm_gcn.clHawaii_64.bin.clT" #include "sgemm_gcn.clBonaire_64.bin.clT" #include "sgemm_gcn.clTahiti_64.bin.clT" #endif // // The name of the 'const char *' providing the kernel OpenCL source // // dgemm_TATB_DIVN_DIVM_DIVK_BS0xBS1_NV0xNV1 // // For instance, DGEMM_SRC_NAME(N,T,32,64,8,8,8,4,8) is dgemm_NT_32_64_8_8x8_4x8 // #define SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT #define SGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti #define SGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Bonaire // // The name of the 'const char []' global variable that contain the SPIR data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir // #define SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // // The name of the 'const char []' global variable that contain the CL binaries data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin // // The name of the kernel itself. 
// This is basically the name returned by DGEMM_SRC_NAME but as string // #define SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "sgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT // // Helpers to transform N and T in proper clblas values for the macros above // #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL, NULL, 0, \ trans_##TA, trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Fill a variant descriptor using SPIR #define SGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ NULL , "-x spir -spir-std=1.2" \ SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ sizeof(SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Fill a variant descriptor using CL Binaries #define SGEMM_VARIANT_BIN_CL1(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL , NULL, \ SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT), \ sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } #define SGEMM_VARIANT_BIN_CL2(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL , "-cl-std=CL2.0", \ SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT), \ sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Make it 1 to enable additional debug 'print' #define VERB 0 // Just because the full name is too long typedef clblasSgemmFunctorGCN::Variant Variant ; // // The static cache used to store all instances of clblasSgemmFunctorGCN // typedef clblasFunctorCache Cache ; static Cache cache ; // return true iff a kernel variant is applicable to the specified args static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args ) { #if 0 // Transpose values are tested in select_variant if ( args.transA != var.transA ) return false ; if ( args.transB != var.transB ) return false ; #endif if ( args.N % var.divN != 0 ) return false ; if ( args.M % var.divM != 0 ) return false ; if ( args.K % var.divK != 0 ) return false ; if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) return false ; return true ; } // // The goal of this function is to return the Variant to be used // for the DGEMM specified by 'args'. // // The variants are typically tested sequentially from the more // specific to the more generic. Additional conditions can be // placed into the surrounding 'if' (typically that would be // to perform additional tests on M, N and K). 
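//
// For this SGEMM functor only pre-compiled binary variants are registered
// (see the "we only manage the binary version here" notes below): Tahiti
// gets CL1.x binaries for the 96x96x16 and 64x64x16 tilings, while Hawaii
// and Bonaire get CL2.0 binaries for the 64x64x16 tiling and rely on the
// split SGEMM functor for the larger blocking. As an illustration,
// SGEMM_KERNEL_NAME(N,N,96,96,16,16,16,6,6,__ALPHABETA) expands to
// "sgemm_NN_96_96_16_16x16_6x6__ALPHABETA", and that variant is selected
// when M and N are multiples of 96, K is a multiple of 16 and beta is
// non-zero (otherwise the __ALPHA variant is tried).
//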
// // static const Variant * select_variant( clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { // if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { if (true) { //we only manage the binary version here if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } //For GCN2 devices we will use the splitsgemm functor } if (true) { //we only manage the binary version here if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } } } else { // ===== sgemm NT ====== if (true) { //we only manage the binary version here if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,T,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,T,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } //For GCN2 devices we will use the splitsgemm functor //else if(!strcmp(DevName, "Hawaii")) //{ // static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,HAWAII, __ALPHABETA) ; // if ( applicable(variant,args) ) // return &variant ; // static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,HAWAII, __ALPHA) ; // if ( applicable(variantA,args) ) // return &variantA ; //} //else if(!strcmp(DevName, "Bonaire")) //{ // static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,BONAIRE, __ALPHABETA) ; // if ( applicable(variant,args) ) // return &variant ; // static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,96,96,16,16,16,6,6,64,BONAIRE, __ALPHA) ; // if ( applicable(variantA,args) ) // return &variantA ; //} } if (true) { //we only manage the binary version here if(!strcmp(DevName, 
"Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,T,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,T,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } } } } else { if ( args.transB == clblasNoTrans ) { // ===== sgemm TN ====== if (true) { //we only manage the binary version here if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(T,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(T,N,96,96,16,16,16,6,6,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } //For GCN2 devices we will use the splitsgemm functor } if (true) { //we only manage the binary version here if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(T,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(T,N,64,64,16,16,16,4,4,64,TAHITI, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,HAWAII, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,64,64,16,16,16,4,4,64,BONAIRE, __ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } } } } return NULL ; // No suitable variant ... 
will use the fallback } clblasSgemmFunctorGCN::clblasSgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) : m_program(0) , m_variant(variant) { cl_device_id device; cl_context context; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clblasSgemmFunctorGCN"); //clGetDeviceInfo(device, CL_DEVICE_NAME); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != 0 ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { // directly build from a char* err = bl.buildFromSource(this->m_variant->source); } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clblasStatus clblasSgemmFunctorGCN::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s\n", this->m_variant->kernel_name) ; cl_kernel kernel = clCreateKernel( this->m_program, this->m_variant->kernel_name, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; if (VERB) printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ; int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg=0 ; // All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) setKernelArg(kernel, arg++, args.A); setKernelArg(kernel, arg++, args.B); setKernelArg(kernel, arg++, args.C); setKernelArg(kernel, arg++, M); setKernelArg(kernel, arg++, N); setKernelArg(kernel, arg++, K); setKernelArg(kernel, arg++, args.alpha); if (args.beta!=0 && this->m_variant->mult.compare("__ALPHA")!=0) setKernelArg(kernel, arg++, args.beta); setKernelArg(kernel, arg++, lda); setKernelArg(kernel, arg++, ldb); setKernelArg(kernel, arg++, ldc); setKernelArg(kernel, arg++, offsetA); setKernelArg(kernel, arg++, offsetB); setKernelArg(kernel, arg++, offsetC); const size_t * ls = this->m_variant->ls ; // Each work group is made of ls[0] x ls[1] PE const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); thy = thy/ls[1] + ((thy%ls[1] != 0) ? 1 : 0); thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, ls , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblasSgemmFunctorGCN * clblasSgemmFunctorGCN::provide(clblasSgemmFunctor::Args & args, const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
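//
// Only devices reporting 64-bit addressing are handled here: the address
// bits queried below are forwarded to select_variant(), which declines to
// return a variant for anything else, so provide() returns NULL and the
// caller falls back to a more generic SGEMM path.
//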
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; Cache::Lookup lookup(cache, ctxt, dev, variant) ; if ( lookup.ok() ) { clblasSgemmFunctorGCN * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clblasSgemmFunctorGCN * functor = new clblasSgemmFunctorGCN(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/gcn_sgemmSmallMatrices.cc000066400000000000000000000451141264277366700245100ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "gcn_sgemmSmallMatrices.h" #if BUILD_KERNEL_FROM_STRING #include "sgemm_gcn_SmallMatrices.clT" #else #include "sgemm_gcn_SmallMatrices.clHawaii_64.bin.clT" #include "sgemm_gcn_SmallMatrices.clBonaire_64.bin.clT" #include "sgemm_gcn_SmallMatrices.clTahiti_64.bin.clT" #endif #define SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT #define SGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti #define SGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Bonaire // // The name of the 'const char []' global variable that contain the SPIR data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir // #define SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) sgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // // The name of the 'const char []' global variable that contain the CL binaries data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin // // The name of the kernel itself. 
// This is basically the name returned by DGEMM_SRC_NAME but as string // #define SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "sgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT // // Helpers to transform N and T in proper clblas values for the macros above // #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ SGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL, NULL, 0, \ trans_##TA, trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using SPIR #define SGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , "-x spir -spir-std=1.2" \ SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ sizeof(SGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Fill a variant descriptor using CL Binaries #define SGEMM_VARIANT_BIN_CL1(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE, MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , NULL, \ SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT), \ sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } #define SGEMM_VARIANT_BIN_CL2(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE, MULT) { \ SGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1, MULT) , \ NULL , "-cl-std=CL2.0", \ SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT), \ sizeof(SGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS, MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } // Just because the full name is too long typedef clBlasGCNSgemmSmallMatricesFunctor::Variant Variant ; ////define the string name of the soure/binary code //#define DGEMM_SRC_NAME(TA,TB,MULT) dgemm_##TA##TB##_SMALL##MULT //#define DGEMM_SRC_NAME_HAWAII(TA,TB, MULT, BITS) dgemm_##TA##TB##_SMALL##MULT##_##BITS##_bin_Hawaii // ////variant name used to differentiate the different ones //#define DGEMM_VARIANT_NAME(TA,TB, MULT) "dgemm_" #TA #TB "_SMALL" #MULT ////DGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), // //#define DGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "dgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_SPLIT_" #BLOC // // //#define trans_N clblasNoTrans //#define trans_T clblasTrans // //// Fill a variant descriptor using OpenCL source //#define DGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ // KERNEL_NAME_MAIN, KERNEL_NAME_ROW, \ // KERNELS_SRC, \ // KERNEL_BUILD_OPTIONS, \ // KERNELS_BIN, \ // KERNEL_BIN_SIZE) { \ // DGEMM_VARIANT_NAME(TA,TB, MULT), \ //{ KERNEL_NAME_MAIN, KERNEL_NAME_ROW } , \ // KERNELS_SRC, \ // KERNEL_BUILD_OPTIONS, \ // KERNELS_BIN, \ // KERNEL_BIN_SIZE, \ // trans_##TA, trans_##TB, \ // DIVK , \ //{ BS0, BS1 } , \ //{ NV0, NV1 } , \ //#MULT \ //} typedef clblasFunctorCache CacheSMallsgemm ; static CacheSMallsgemm cachesmall ; // Make it 1 to enable additional debug 'print' #define VERB 0 static bool applicable( const Variant & var, 
clblasSgemmFunctor::Args & args ) { #if 0 // Transpose values are tested in select_variant if ( args.transA != var.transA ) return false ; if ( args.transB != var.transB ) return false ; #endif //if (args.N>=var.divN && args.N % var.divN != 0 ) if ( args.N % var.divN != 0 ) return false ; if ( args.M % var.divM != 0 ) return false ; if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) return false ; return true ; } static const Variant * select_variant_GCNSmallMatrices( clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,N,32,32,16,16,16,2,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,N,32,32,16,16,16,2,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,N,32,32,16,16,16,2,2,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,N,32,32,16,16,16,2,2,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,32,32,16,16,16,2,2,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,32,32,16,16,16,2,2,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,N,32,32,16,16,16,2,2,64,BONAIRE,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,N,32,32,16,16,16,2,2,64,BONAIRE,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif } } if (args.transB == clblasTrans) { if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(N,T,32,32,16,16,16,2,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(N,T,32,32,16,16,16,2,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(N,T,32,32,16,16,16,2,2,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(N,T,32,32,16,16,16,2,2,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,32,32,16,16,16,2,2,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,32,32,16,16,16,2,2,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else 
if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(N,T,32,32,16,16,16,2,2,64,BONAIRE,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(N,T,32,32,16,16,16,2,2,64,BONAIRE,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif } } } else { if ( args.transB == clblasNoTrans ) { if (true) { #if BUILD_KERNEL_FROM_STRING static const Variant variant = DGEMM_VARIANT_SRC(T,N,32,32,16,16,16,2,2,__ALPHABETA) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = DGEMM_VARIANT_SRC(T,N,32,32,16,16,16,2,2,__ALPHA) ; if ( applicable(variantA,args) ) return &variantA ; #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL1(T,N,32,32,16,16,16,2,2,64,TAHITI,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL1(T,N,32,32,16,16,16,2,2,64,TAHITI,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,32,32,16,16,16,2,2,64,HAWAII,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,32,32,16,16,16,2,2,64,HAWAII,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL } else if(!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL static const Variant variant = SGEMM_VARIANT_BIN_CL2(T,N,32,32,16,16,16,2,2,64,BONAIRE,__ALPHABETA ) ; if ( applicable(variant,args) ) return &variant ; static const Variant variantA = SGEMM_VARIANT_BIN_CL2(T,N,32,32,16,16,16,2,2,64,BONAIRE,__ALPHA ) ; if ( applicable(variantA,args) ) return &variantA ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif } } } return NULL; } clBlasGCNSgemmSmallMatricesFunctor::clBlasGCNSgemmSmallMatricesFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variant = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlasGCNSgemmSmallMatricesFunctor"); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { //// directly build from a char* err = bl.buildFromSource(this->m_variant->source); return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlasGCNSgemmSmallMatricesFunctor * clBlasGCNSgemmSmallMatricesFunctor::provide(clblasSgemmFunctor::Args & args, const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant_GCNSmallMatrices( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; CacheSMallsgemm::Lookup lookup(cachesmall, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlasGCNSgemmSmallMatricesFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlasGCNSgemmSmallMatricesFunctor * functor = new clBlasGCNSgemmSmallMatricesFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } //need to rewrite execute!!! clblasStatus clBlasGCNSgemmSmallMatricesFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variant->kernel_name, args.alpha, args.beta) ; cl_kernel kernel; // int NBKernel = 0; kernel= clCreateKernel( this->m_program, this->m_variant->kernel_name, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; //if (NBKernel != 4) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ; } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg=0 ; //// All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) setKernelArg(kernel, arg++, args.A); setKernelArg(kernel, arg++, args.B); setKernelArg(kernel, arg++, args.C); setKernelArg(kernel, arg++, M); setKernelArg(kernel, arg++, N); setKernelArg(kernel, arg++, K); setKernelArg(kernel, arg++, args.alpha); if (args.beta!=0 && this->m_variant->mult.compare("__ALPHA")!=0) setKernelArg(kernel, arg++, args.beta); setKernelArg(kernel, arg++, lda); setKernelArg(kernel, arg++, ldb); setKernelArg(kernel, arg++, ldc); setKernelArg(kernel, arg++, offsetA); setKernelArg(kernel, arg++, offsetB); setKernelArg(kernel, arg++, offsetC); // err = KernelsLaunch(queue, kernel, args); const size_t * ls = this->m_variant->ls ; // Each work group is made of ls[0] x ls[1] PE const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); thy = thy/ls[1] + ((thy%ls[1] != 0) ? 
1 : 0); thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, ls , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblas-2.10/src/library/blas/functor/gcn_zgemm.cc000066400000000000000000000265301264277366700220370ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include #include "BinaryBuild.h" //for the moment only managing source code and cl binary #if BUILD_KERNEL_FROM_STRING #include "zgemm_gcn.clT" #else #include "zgemm_gcn.clHawaii_64.bin.clT" //#include "zgemm_gcn.clBonaire_64.bin.clT" //#include "ZGEMM_gcn.clTahiti_64.bin.clT" #endif // // The name of the 'const char *' providing the kernel OpenCL source // // dgemm_TATB_DIVN_DIVM_DIVK_BS0xBS1_NV0xNV1 // // For instance, DGEMM_SRC_NAME(N,T,32,64,8,8,8,4,8) is dgemm_NT_32_64_8_8x8_4x8 // #define ZGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) zgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT #define ZGEMM_SRC_NAME_TAHITI(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) zgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Tahiti #define ZGEMM_SRC_NAME_HAWAII(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) zgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Hawaii #define ZGEMM_SRC_NAME_BONAIRE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT) zgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1##MULT##_##BITS##_bin_Bonaire // // The name of the 'const char []' global variable that contain the SPIR data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _spir // #define ZGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) zgemm_##TA##TB##_##DIVN##_##DIVM##_##DIVK##_##BS0##x##BS1##_##NV0##x##NV1_spir // // The name of the 'const char []' global variable that contain the CL binaries data. // That name is similar to the one produced by DGEMM_SRC_NAME but suffixed by _bin // // The name of the kernel itself. 
// This is basically the name returned by DGEMM_SRC_NAME but as string // #define ZGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) "zgemm_" #TA #TB "_" #DIVN "_" #DIVM "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT // // Helpers to transform N and T in proper clblas values for the macros above // #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define ZGEMM_VARIANT_SRC(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) { \ ZGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ ZGEMM_SRC_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL, NULL, 0, \ trans_##TA, trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Fill a variant descriptor using SPIR #define ZGEMM_VARIANT_SPIR(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) { \ ZGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1) , \ NULL , "-x spir -spir-std=1.2" \ ZGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1), \ sizeof(ZGEMM_SPIR_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Fill a variant descriptor using CL Binaries #define ZGEMM_VARIANT_BIN_CL1(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) { \ ZGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL , NULL, \ ZGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT), \ sizeof(ZGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } #define ZGEMM_VARIANT_BIN_CL2(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,DEVICE,MULT) { \ ZGEMM_KERNEL_NAME(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,MULT) , \ NULL , "-cl-std=CL2.0", \ ZGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT), \ sizeof(ZGEMM_SRC_NAME##_##DEVICE(TA,TB,DIVN,DIVM,DIVK,BS0,BS1,NV0,NV1,BITS,MULT)), \ trans_##TA,trans_##TB, \ DIVN,DIVM,DIVK, \ { BS0, BS1 } , \ { NV0, NV1 } \ } // Make it 1 to enable additional debug 'print' #define VERB 0 // Just because the full name is too long typedef clblasZgemmFunctorGCN::Variant Variant ; // // The static cache used to store all instances of clblasZgemmFunctorGCN // typedef clblasFunctorCache Cache ; static Cache cache ; // return true iff a kernel variant is applicable to the specified args static bool applicable( const Variant & var, clblasZgemmFunctor::Args & args ) { #if 0 // Transpose values are tested in select_variant if ( args.transA != var.transA ) return false ; if ( args.transB != var.transB ) return false ; #endif if ( args.N % var.divN != 0 ) return false ; if ( args.M % var.divM != 0 ) return false ; if ( args.K % var.divK != 0 ) return false ; //if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; return true ; } // // The goal of this function is to return the Variant to be used // for the ZGEMM specified by 'args'. // // The variants are typically tested sequentially from the more // specific to the more generic. Additional conditions can be // placed into the surrounding 'if' (typically that would be // to perform additional tests on M, N and K). 
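// Illustrative example (not part of the original clBLAS source): the only
// variant currently wired up below is the Hawaii NT one declared via
// ZGEMM_VARIANT_BIN_CL2(N,T,64,32,8,16,16,2,4,64,HAWAII,__ALPHABETA).
// Mechanically expanding ZGEMM_KERNEL_NAME with those arguments yields the
// string "zgemm_NT_64_32_8_16x16_2x4__ALPHABETA", and applicable() accepts the
// variant only when N % 64 == 0 (divN), M % 32 == 0 (divM) and K % 8 == 0
// (divK) -- which matches the M%32 / N%64 / K%8 guard in the Hawaii functor
// selector (hawaii.cc) that routes ZGEMM calls to this functor.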
// // static const Variant * select_variant( clblasZgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasTrans ) { //we only manage the binary version here if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL //zgemm_NT_32_64_8_16x16_2x4__ALPHABETA_64_bin_Hawaii static const Variant variant = ZGEMM_VARIANT_BIN_CL2(N,T,64,32,8,16,16,2,4,64,HAWAII, __ALPHABETA) ; if ( applicable(variant,args) ) // needs to return true return &variant; #endif } } } return NULL ; // No suitable variant ... will use the fallback } /* * constructor */ clblasZgemmFunctorGCN::clblasZgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) : m_program(0) , m_variant(variant) { cl_device_id device; cl_context context; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clblasZgemmFunctorGCN"); //clGetDeviceInfo(device, CL_DEVICE_NAME); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != 0 ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { // directly build from a char* err = bl.buildFromSource(this->m_variant->source); } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clblasStatus clblasZgemmFunctorGCN::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s\n", this->m_variant->kernel_name) ; cl_kernel kernel = clCreateKernel( this->m_program, this->m_variant->kernel_name, &err); if (err != CL_SUCCESS) return clblasStatus(err) ; if (VERB) printf(" ===> FOUND %s\n", this->m_variant->kernel_name) ; int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg=0 ; // All zgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) setKernelArg(kernel, arg++, M); setKernelArg(kernel, arg++, N); setKernelArg(kernel, arg++, K); setKernelArg(kernel, arg++, args.alpha); setKernelArg(kernel, arg++, args.beta); setKernelArg(kernel, arg++, args.A); setKernelArg(kernel, arg++, args.B); setKernelArg(kernel, arg++, args.C); setKernelArg(kernel, arg++, lda); setKernelArg(kernel, arg++, ldb); setKernelArg(kernel, arg++, ldc); setKernelArg(kernel, arg++, offsetA); setKernelArg(kernel, arg++, offsetB); setKernelArg(kernel, arg++, offsetC); const size_t * ls = this->m_variant->ls ; // Each work group is made of ls[0] x ls[1] PE const size_t * bwi = this->m_variant->bwi ; // Each PE updates bwi[0] x bwi[1] values size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); thy = thy/ls[1] + ((thy%ls[1] != 0) ? 
1 : 0); thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, ls , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblasZgemmFunctorGCN * clblasZgemmFunctorGCN::provide(clblasZgemmFunctor::Args & args, const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; Cache::Lookup lookup(cache, ctxt, dev, variant) ; if ( lookup.ok() ) { clblasZgemmFunctorGCN * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still being used return functor; } clblasZgemmFunctorGCN * functor = new clblasZgemmFunctorGCN(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/gpu_dtrsm.cc000066400000000000000000000607471264277366700221050ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include "functor.h" #include "binary_lookup.h" #include #include "functor_xtrsm.h" #include "gpu_dtrsm.h" #include "tahiti.h" #include "BinaryBuild.h" #if BUILD_KERNEL_FROM_STRING #include "dtrsm_gpu.clT" #else #include "dtrsm_gpu.clHawaii_64.bin.clT" #include "dtrsm_gpu.clTahiti_64.bin.clT" #endif // Make it 1 to enable additional debug 'print' #define VERB 0 //TODO //clReleaseKernel(kernel) ; #define BLOCK_SIZE 16 // inner blocking size, <=32 #define NB 128 // outer blocking size, >BLOCK_SIZE // // The static cache used to store all instances of clblasDtrsmFunctorGpu /clblasDgemmFunctorTahiti // typedef clblasFunctorCache Cache ; static Cache cache ; clblasDtrsmFunctorGpu::clblasDtrsmFunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) : m_program(0) { cl_device_id device; cl_context context; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", "clblasDtrsmFunctorGpu") ; BinaryLookup bl(context, device, "clblasDtrsmFunctorGpu"); if ( !bl.found() ) // may create empty file or may wait until file is ready { // directly build from a char* #if BUILD_KERNEL_FROM_STRING err = bl.buildFromSource(dtrsm_gpu_kernels); #else if(!strcmp(DevName, "Tahiti")) { #ifndef CLBLAS_TAHITI_DYNAMIC_KERNEL if(_64BitsUse==64) err = bl.buildFromBinary(dtrsm_gpu_kernels_64_bin_Tahiti, sizeof(dtrsm_gpu_kernels_64_bin_Tahiti), NULL); else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } #endif } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL if(_64BitsUse==64) err = bl.buildFromBinary(dtrsm_gpu_kernels_64_bin_Hawaii, sizeof(dtrsm_gpu_kernels_64_bin_Hawaii), NULL); else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } #endif } #endif if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } #define CALL_KERNEL_TRIPLE_UPDATE(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event) \ do{ \ err = call_kernel_triple_update(kernel_name, prg, queue, A, offA, 
d_dinvA, i, lda, M, event); \ if(err != CL_SUCCESS) { \ return err; \ } \ } while(0) cl_int call_kernel_triple_update(const char* kernel_name, const cl_program prg, const cl_command_queue queue, cl_mem A, unsigned int offA, cl_mem d_dinvA, int i, unsigned int lda, int M, cl_event *event) { cl_int err = 0; unsigned int m = M; int npages = M/(i*2)+(M%(i*2)!=0); size_t globalLocal [2] = { (i <= 32)?(i/4):16, 4}; size_t globalThreads[2] = { (i/(globalLocal[0]*globalLocal[1]))* globalLocal[0], npages*(i/16) * globalLocal[1]}; cl_kernel kernel = clCreateKernel(prg, kernel_name, &err); if (err != CL_SUCCESS) { //printf( "create kernel %s failed with %d\n", kernel_name, err ); return err; } clSetKernelArg(kernel, 0, sizeof(cl_mem), &A); clSetKernelArg(kernel, 1, sizeof(unsigned int), &offA); clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_dinvA); clSetKernelArg(kernel, 3, sizeof(int), &i); clSetKernelArg(kernel, 4, sizeof(unsigned int), &lda); clSetKernelArg(kernel, 5, sizeof(int), &npages); clSetKernelArg(kernel, 6, sizeof(unsigned int), &m); err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, globalLocal , 0, NULL, event); if (err != CL_SUCCESS) { clReleaseKernel(kernel); //printf( "execution of kernel %s failed with %d\n", kernel_name, err ); return err; } err = clReleaseKernel(kernel); return err; } //extern "C" cl_int diag_dtrtri (cl_program prg, cl_command_queue queue, int M, clblasUplo uplo, clblasDiag diag, cl_mem A, size_t offA, cl_mem d_dinvA, size_t lda, cl_event *event ) { cl_int err = 0; /* This routine is used in dtrsm */ int nthreads = (M/BLOCK_SIZE + (M % BLOCK_SIZE != 0)) * BLOCK_SIZE; unsigned int m = M; if (uplo == clblasLower) { cl_kernel diag_dtrtri_kernel_lower = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_LOWER", &err); if (err != CL_SUCCESS) { //printf( "create kernel -diag_dtrtri_kernel_lower- failed with %d\n", err ); return err; } int isDiagUnit = (diag == clblasUnit); clSetKernelArg(diag_dtrtri_kernel_lower, 0, sizeof(int), &isDiagUnit); clSetKernelArg(diag_dtrtri_kernel_lower, 1, sizeof(cl_mem), &A); clSetKernelArg(diag_dtrtri_kernel_lower, 2, sizeof(unsigned int), &offA); clSetKernelArg(diag_dtrtri_kernel_lower, 3, sizeof(cl_mem), &d_dinvA); clSetKernelArg(diag_dtrtri_kernel_lower, 4, sizeof(unsigned int), &lda); clSetKernelArg(diag_dtrtri_kernel_lower, 5, sizeof(unsigned int), &m); size_t globalThreads[1] = { nthreads }; size_t globalLocal [1] = { BLOCK_SIZE }; err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_lower, 1, NULL, globalThreads, globalLocal , 0, NULL, event); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_lower- failed with %d\n", err ); return err; } err = clReleaseKernel(diag_dtrtri_kernel_lower); if (err != CL_SUCCESS) { return err; } // update the inverse up to the size of BLOCK_SIZE for( int i=BLOCK_SIZE; i < NB; i*=2 ) { switch (i) { case 16: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 32: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 64: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); 
break; default: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; } if (i*2 >= M) break; } } else { cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err); if (err != CL_SUCCESS) { //printf( "create kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } int isDiagUnit = (diag == clblasUnit); clSetKernelArg(diag_dtrtri_kernel_upper, 0, sizeof(int), &isDiagUnit); clSetKernelArg(diag_dtrtri_kernel_upper, 1, sizeof(cl_mem), &A); clSetKernelArg(diag_dtrtri_kernel_upper, 2, sizeof(unsigned int), &offA); clSetKernelArg(diag_dtrtri_kernel_upper, 3, sizeof(cl_mem), &d_dinvA); clSetKernelArg(diag_dtrtri_kernel_upper, 4, sizeof(unsigned int), &lda); clSetKernelArg(diag_dtrtri_kernel_upper, 5, sizeof(unsigned int), &m); size_t globalThreads[1] = { nthreads }; size_t globalLocal [1] = { BLOCK_SIZE }; err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_upper, 1, NULL, globalThreads, globalLocal , 0, NULL, event); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } clReleaseKernel(diag_dtrtri_kernel_upper); if (err != CL_SUCCESS) { return err; } // update the inverse up to the size of BLOCK_SIZE for( int i=BLOCK_SIZE; i < NB; i*=2 ) { switch (i) { case 16: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 32: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 64: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; default: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; } if (i*2 >= M) break; } } return err; } #define check_error(cmd) \ do{ \ cl_int xxxerr = cmd ; \ if (xxxerr != CL_SUCCESS) { \ if(InvA != 0) \ clReleaseMemObject(InvA); \ if(X != 0) \ clReleaseMemObject(X); \ return xxxerr; \ } \ } while(0) static cl_int clearBuffer( cl_command_queue queue , cl_mem buffer , size_t buffer_size ) { cl_int err = 0; cl_event event; // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!! 
double zero = 0.0 ; err = clEnqueueFillBuffer(queue, buffer, &zero, sizeof(double), 0, // offset buffer_size, 0, NULL, &event ) ; return err; } #define nb 128 // outer blocking size, >BLOCK_SIZE #define min(x,y) ((x)<(y)?(x):(y)) cl_int cl_dtrsm( cl_program prg, cl_command_queue queue , clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, int M, int N, double alpha, cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_event *event ) { cl_int err = 0; int i; cl_context context; err = getQueueContext(queue, &context); if(err != CL_SUCCESS) return err; /* quick return on wrong size */ if (M <= 0 || N <= 0) return clblasInvalidDim; double neg_one = -1.0 ; double one = 1.0 ; double zero = 0.0 ; // Compute the number of blocks of the specified 'size' to fully cover 'n' // Simply speaking, this is n/size rounded up. #define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) ) #define CLEANUP #define END_DGEMM_ARGS 1,&queue,0,NULL,event // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm #define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M cl_mem InvA = 0; cl_mem X = 0; // X of size mxn will contain the result size_t ldX = M ; size_t offX = 0; //must be 0: needed by the _(X,i,j) macro size_t size_X = N*ldX * sizeof(double); X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err); check_error(err) ; err = clearBuffer( queue, X, size_X ) ; check_error(err) ; if (side == clblasLeft) { // side=L /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ size_t ldInvA = nb ; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(M,nb) * nb *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); check_error(err) ; err = clearBuffer( queue, InvA, size_InvA ) ; check_error(err) ; diag_dtrtri (prg, queue, M, uplo, diag, A, offA, InvA, ldA, event); // // Helper for C = alpha * transp(A) * B + beta * C // // In the calls below: // - the 1st matrix shall be either A or InvA transposed according to transA. // - the 2nd and 3rd matrices are either B and X // #define DGEMM_LEFT(m, n, k, alpha, A, B, beta, C) \ do { \ err = clblasDgemm(clblasColumnMajor, transA, clblasNoTrans , m, n, k, alpha, A, B, beta, C , 1, &queue, 0, NULL, event ) ; \ check_error(err) ; \ } while(0) if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int mm = min(nb, (int) M); DGEMM_LEFT( mm, N, mm, alpha, _(InvA,0,0) , _(B,0,0), zero, _(X,0,0) ); if (nb < M) { DGEMM_LEFT( M-nb, N, nb, neg_one, _(A,nb,0), _(X,0,0), alpha, _(B,nb,0) ); /* the rest blocks */ for( i=nb; i < M; i += nb ) { mm = min((int)M-i, nb); DGEMM_LEFT( mm, N, mm, one, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i+nb >= M) break; DGEMM_LEFT( M-i-nb, N, nb, neg_one, _(A,i+nb,i), _(X,i,0), one, _(B,i+nb,0) ); } //check_last_error() ; } } else // if ( uplo == clblasUpper) { /* the upper case */ /* handle the first block seperately with alpha */ int mm = (M % nb == 0) ? 
nb : (M % nb); i = M-mm; DGEMM_LEFT( mm, N, mm, alpha, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i-nb >= 0) { DGEMM_LEFT( i, N, mm, neg_one, _(A,0,i), _(X,i,0), alpha, _(B,0,0) ); /* the rest blocks */ for( i=M-mm-nb; i >= 0; i -= nb ) { DGEMM_LEFT( nb, N, nb, one, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i-nb < 0) break; DGEMM_LEFT( i, N, nb, neg_one, _(A,0,i), _(X,i,0), one, _(B,0,0) ); } } } } else { /* the transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int mm = (M % nb == 0) ? nb : (M % nb); i = M-mm; DGEMM_LEFT( mm, N, mm, alpha, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i-nb >= 0) { DGEMM_LEFT( i, N, mm, neg_one, _(A,i,0), _(X,i,0), alpha, _(B,0,0) ); /* the rest blocks */ for( i=M-mm-nb; i >= 0; i -= nb ) { DGEMM_LEFT( nb, N, nb, one, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i-nb < 0) break; DGEMM_LEFT( i, N, nb, neg_one, _(A,i,0), _(X,i,0), one, _(B,0,0) ); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int mm = min(nb, (int)M); DGEMM_LEFT( mm, N, mm, alpha, _(InvA,0,0), _(B,0,0), zero, _(X,0,0) ); if (nb < M) { DGEMM_LEFT( M-nb, N, nb, neg_one, _(A,0,nb) , _(X,0,0), alpha, _(B,nb,0) ); /* the rest blocks */ for( i=nb; i < M; i += nb ) { mm = min((int)M-i, nb); DGEMM_LEFT( mm, N, mm, one, _(InvA,0,i), _(B,i,0), zero, _(X,i,0) ); if (i+nb >= M) break; DGEMM_LEFT( M-i-nb, N, nb, neg_one, _(A,i,i+nb), _(X,i,0), one, _(B,i+nb,0) ); } } } } } else { // // Helper for C = alpha * B * A + beta * C // // In the calls below // - the 2nd matrix shall be either A or InvA transposed according to transA // - the 1st and 3rd matrices are either B and X // #define DGEMM_RIGHT(m,n,k, alpha, B, A, beta, C ) \ do { \ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, &queue, 0, NULL, event ) ; \ check_error(err) ; \ } while(0) // side=R /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE */ /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ size_t ldInvA = nb ; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(N,nb) * nb *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); check_error(err) ; err = clearBuffer( queue, InvA, size_InvA ) ; check_error(err) ; diag_dtrtri (prg, queue, N, uplo, diag, A, offA, InvA, ldA, event); if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int nn = (N % nb == 0) ? 
nb : (N % nb); i = N-nn; DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i-nb >= 0) { DGEMM_RIGHT( M, i, nn, neg_one, _(X,0,i), _(A,i,0), alpha, _(B,0,0) ); /* the rest blocks */ for( i=N-nn-nb; i >= 0; i -= nb ) { DGEMM_RIGHT( M, nb, nb, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i-nb < 0) break; DGEMM_RIGHT( M, i, nb, neg_one, _(X,0,i), _(A,i,0), one, _(B,0,0) ); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = min(nb, (int)N); DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) ); if (nb < N) { DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) ); /* the rest blocks */ for( i=nb; i < N; i += nb ) { nn = min(nb, (int)N-i); DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i+nb >= N) break; DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) ); } } } } else { /* the transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int nn = min(nb, (int)N); DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) ); if (nb < N) { DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,nb,0), alpha, _(B,0,nb) ); /* the rest blocks */ for( i=nb; i < N; i += nb ) { nn = min(nb, (int)N-i); DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i+nb >= N) break; DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,nb+i,i), one, _(B,0,i+nb) ); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = (N % nb == 0) ? nb : (N % nb); i = N-nn; DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i-nb >= 0) { DGEMM_RIGHT( M, i, nn, neg_one, _(X,0,i), _(A,0,i), alpha, _(B,0,0) ); /* the rest blocks */ for( i=N-nn-nb; i >= 0; i -= nb ) { DGEMM_RIGHT( M, nb, nb, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); if (i-nb < 0) break; DGEMM_RIGHT( M, i, nb, neg_one, _(X,0,i), _(A,0,i), one, _(B,0,0) ); } } } } } // Copy X(m,n) to B(m,n) { size_t src_origin[3] = { 0, 0, 0 } ; size_t dst_origin[3] = { offB*sizeof(double), 0, 0 } ; size_t region[3] = { M*sizeof(double), N, 1 } ; err = clEnqueueCopyBufferRect( queue, X, B, src_origin, dst_origin, region, ldX*sizeof(double), 0, ldB*sizeof(double), 0, 0, NULL, event) ; check_error(err) ; clReleaseMemObject(InvA); clReleaseMemObject(X); } return err; } clblasStatus clblasDtrsmFunctorGpu::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s\n", "dtrsm_gpu") ; cl_program prg = this->m_program; err = cl_dtrsm( prg, queue , args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.alpha, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.events ); if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblasDtrsmFunctorGpu * clblasDtrsmFunctorGpu::provide(clblasDtrsmFunctor::Args & args , const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
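// Illustrative note (not part of the original clBLAS source): the code below
// follows the usual functor-cache pattern of this library. The static cache
// declared at the top of this file is looked up per OpenCL context and device
// (with a constant 'true' as the variant key, there being a single dtrsm
// program per device), so the program built in the constructor is created at
// most once per (context, device); on a cache hit the existing functor is
// simply retain()'d and returned instead of being rebuilt.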
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); Cache::Lookup lookup(cache, ctxt, dev, true) ; if ( lookup.ok() ){ clblasDtrsmFunctorGpu * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clblasDtrsmFunctorGpu * functor = new clblasDtrsmFunctorGpu(args, err, DevName, bitness); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/gpu_dtrsm192.cc000066400000000000000000000402401264277366700223230ustar00rootroot00000000000000#include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include "functor.h" #include "binary_lookup.h" #include #include "functor_xtrsm.h" #include "gpu_dtrsm192.h" //#include "tahiti.h" #include "BinaryBuild.h" #if BUILD_KERNEL_FROM_STRING #include "dtrsm_gpu192.clT" #else #include "dtrsm_gpu192.clHawaii_64.bin.clT" #endif // Make it 1 to enable additional debug 'print' #define VERB 0 //TODO //clReleaseKernel(kernel) ; #define BLOCK_SIZE 12 // inner blocking size, <=32 #define NB 192 // outer blocking size, >BLOCK_SIZE // // The static cache used to store all instances of clblasDtrsmFunctorGpu /clblasDgemmFunctorTahiti // typedef clblasFunctorCache Cache ; static Cache cache ; clblasDtrsm192FunctorGpu::clblasDtrsm192FunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) : m_program(0) { cl_device_id device; cl_context context; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", "clblasDtrsmFunctorGpu") ; BinaryLookup bl(context, device, "clblasDtrsm192FunctorGpu"); if ( !bl.found() ) // may create empty file or may wait until file is ready { // directly build from a char* #if BUILD_KERNEL_FROM_STRING err = bl.buildFromSource(dtrsm_gpu_kernels); #else if(!strcmp(DevName, "Tahiti")) { } else if(!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL if(_64BitsUse==64) err = bl.buildFromBinary(dtrsm_gpu192_kernels_64_bin_Hawaii, sizeof(dtrsm_gpu192_kernels_64_bin_Hawaii), NULL); else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } #endif } #endif if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } #define CALL_KERNEL_TRIPLE_UPDATE(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event) \ do{ \ err = call_kernel_triple_update192(kernel_name, prg, queue, A, offA, d_dinvA, i, lda, M, event); \ if(err != CL_SUCCESS) { \ return err; \ } \ } while(0) cl_int call_kernel_triple_update192(const char* kernel_name, const cl_program prg, const cl_command_queue queue, cl_mem A, unsigned int offA, cl_mem d_dinvA, int i, unsigned int lda, int M, cl_event *event) { cl_int err = 0; unsigned int m = M; int npages = M/(i*2)+(M%(i*2)!=0); size_t globalLocal [2]; size_t globalThreads[2]; switch (i) { case 12: globalLocal[0] = 12; globalLocal[1] = 1; globalThreads[0] = npages * 12; globalThreads[1] = 1; break; case 24: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 24; globalThreads[1] = 2; break; case 48: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 48; globalThreads[1] = 4; break; case 96: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 96; globalThreads[1] = 8; break; 
default: break; } cl_kernel kernel = clCreateKernel(prg, kernel_name, &err); if (err != CL_SUCCESS) { //printf( "create kernel %s failed with %d\n", kernel_name, err ); return err; } clSetKernelArg(kernel, 0, sizeof(cl_mem), &A); clSetKernelArg(kernel, 1, sizeof(unsigned int), &offA); clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_dinvA); clSetKernelArg(kernel, 3, sizeof(int), &i); clSetKernelArg(kernel, 4, sizeof(unsigned int), &lda); clSetKernelArg(kernel, 5, sizeof(int), &npages); clSetKernelArg(kernel, 6, sizeof(unsigned int), &m); err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, globalLocal , 0, NULL, event); if (err != CL_SUCCESS) { clReleaseKernel(kernel); //printf( "execution of kernel %s failed with %d\n", kernel_name, err ); return err; } err = clReleaseKernel(kernel); return err; } //extern "C" cl_int diag_dtrtri192 (cl_program prg, cl_command_queue queue, int M, clblasUplo uplo, clblasDiag diag, cl_mem A, size_t offA, cl_mem d_dinvA, size_t lda, cl_event *event ) { cl_int err = 0; /* This routine is used in dtrsm */ //For side==right, M is actually N here int nthreads = (M/BLOCK_SIZE + (M % BLOCK_SIZE != 0)) * BLOCK_SIZE; unsigned int m = M; if (uplo == clblasLower) { //lower is not supported yet } else { cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err); if (err != CL_SUCCESS) { //printf( "create kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } int isDiagUnit = (diag == clblasUnit); clSetKernelArg(diag_dtrtri_kernel_upper, 0, sizeof(int), &isDiagUnit); clSetKernelArg(diag_dtrtri_kernel_upper, 1, sizeof(cl_mem), &A); clSetKernelArg(diag_dtrtri_kernel_upper, 2, sizeof(unsigned int), &offA); clSetKernelArg(diag_dtrtri_kernel_upper, 3, sizeof(cl_mem), &d_dinvA); clSetKernelArg(diag_dtrtri_kernel_upper, 4, sizeof(unsigned int), &lda); clSetKernelArg(diag_dtrtri_kernel_upper, 5, sizeof(unsigned int), &m); size_t globalThreads[1] = { nthreads }; size_t globalLocal [1] = { BLOCK_SIZE }; err = clEnqueueNDRangeKernel(queue, diag_dtrtri_kernel_upper, 1, NULL, globalThreads, globalLocal , 0, NULL, event); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } clReleaseKernel(diag_dtrtri_kernel_upper); if (err != CL_SUCCESS) { return err; } // update the inverse up to the size of BLOCK_SIZE for( int i=BLOCK_SIZE; i < NB; i*=2 ) { switch (i) { case 12: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 24: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 48: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; case 96: CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); break; default: break; } if (i*2 >= M) break; } } return err; } #define check_error(cmd) \ do{ \ cl_int xxxerr = cmd ; \ if (xxxerr != CL_SUCCESS) { \ if(InvA != 0) \ clReleaseMemObject(InvA); \ if(X != 0) \ clReleaseMemObject(X); \ return xxxerr; \ } \ } while(0) static cl_int clearBuffer192( cl_command_queue queue , 
cl_mem buffer , size_t buffer_size ) { cl_int err = 0; cl_event event; // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!! double zero = 0.0 ; err = clEnqueueFillBuffer(queue, buffer, &zero, sizeof(double), 0, // offset buffer_size, 0, NULL, &event ) ; return err; } #define nb 192 // outer blocking size, >BLOCK_SIZE #define min(x,y) ((x)<(y)?(x):(y)) cl_int cl_dtrsm192( cl_program prg, cl_command_queue queue , clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, int M, int N, double alpha, cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_event *event ) { cl_int err = 0; int i; cl_context context; err = getQueueContext(queue, &context); if(err != CL_SUCCESS) return err; /* quick return on wrong size */ if (M <= 0 || N <= 0) return clblasInvalidDim; double neg_one = -1.0 ; double one = 1.0 ; double zero = 0.0 ; // Compute the number of blocks of the specified 'size' to fully cover 'n' // Simply speaking, this is n/size rounded up. #define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) ) #define CLEANUP #define END_DGEMM_ARGS 1,&queue,0,NULL,event // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm #define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M cl_mem InvA = 0; cl_mem X = 0; // X of size mxn will contain the result size_t ldX = M ; size_t offX = 0; //must be 0: needed by the _(X,i,j) macro size_t size_X = N*ldX * sizeof(double); X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err); check_error(err) ; err = clearBuffer192( queue, X, size_X ) ; check_error(err) ; if (side == clblasLeft) { // side=L /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ // Not supported yet with 192 block size } else { // // Helper for C = alpha * B * A + beta * C // // In the calls below // - the 2nd matrix shall be either A or InvA transposed according to transA // - the 1st and 3rd matrices are either B and X // #define DGEMM_RIGHT(m,n,k, alpha, B, A, beta, C ) \ do { \ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, &queue, 0, NULL, event ) ; \ check_error(err) ; \ } while(0) // side=R /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE */ /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ size_t ldInvA = nb ; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(N,nb) * nb *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); check_error(err) ; err = clearBuffer192( queue, InvA, size_InvA ) ; check_error(err) ; diag_dtrtri192 (prg, queue, N, uplo, diag, A, offA, InvA, ldA, event); if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ // lower is not implemented yet } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = min(nb, (int)N); //DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldB, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1, &queue, 0, NULL, event); check_error(err); if (nb < N) { //DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - nb, nb, neg_one, X, offX, ldX, A, offA+ldA*nb, ldA, 
alpha, B, offB+nb*ldB, ldB, 1, &queue, 0, NULL, event); assert(err == CL_SUCCESS); /* the rest blocks */ for( i=nb; i < N; i += nb ) { nn = min(nb, (int)N-i); //DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB+i*ldB, ldB, InvA, offInvA+i*nb, ldInvA, zero, X, offX+i*ldX, ldX, 1, &queue, 0, NULL, event); assert(err == CL_SUCCESS); if (i+nb >= N) break; //DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - nb, nb, neg_one, X, offX+i*ldX, ldX, A, offA + i + (nb + i)*ldA, ldA, one, B, offB + (i + nb)*ldB, ldB, 1, &queue, 0, NULL, event); assert(err == CL_SUCCESS); } } } } else { /* the transpose case */ // trans is not implemented yet } } // Copy X(m,n) to B(m,n) { size_t src_origin[3] = { 0, 0, 0 } ; size_t dst_origin[3] = { offB*sizeof(double), 0, 0 } ; size_t region[3] = { M*sizeof(double), N, 1 } ; err = clEnqueueCopyBufferRect( queue, X, B, src_origin, dst_origin, region, ldX*sizeof(double), 0, ldB*sizeof(double), 0, 0, NULL, event) ; check_error(err) ; clReleaseMemObject(InvA); clReleaseMemObject(X); } return err; } clblasStatus clblasDtrsm192FunctorGpu::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s\n", "dtrsm_gpu") ; cl_program prg = this->m_program; err = cl_dtrsm192( prg, queue , args.side, args.uplo, args.transA, args.diag, args.M, args.N, args.alpha, args.A, args.offA, args.lda, args.B, args.offB, args.ldb, args.events ); if (VERB) printf(" ===> ERR=%d \n",(int)err) ; return clblasStatus(err) ; } clblasDtrsm192FunctorGpu * clblasDtrsm192FunctorGpu::provide(clblasDtrsmFunctor::Args & args , const char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); Cache::Lookup lookup(cache, ctxt, dev, true) ; if ( lookup.ok() ){ clblasDtrsm192FunctorGpu * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clblasDtrsm192FunctorGpu * functor = new clblasDtrsm192FunctorGpu(args, err, DevName, bitness); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblas-2.10/src/library/blas/functor/hawaii.cc000066400000000000000000000177541264277366700213430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "gpu_dtrsm.h" #include "gcn_dgemmCommon.h" #include "math.h" //#include "hawaii_dgemmChannelConflict.h" #include "hawaii_dgemmSplitKernel.h" #include "hawaii_sgemmSplitKernel.h" #include "gcn_dgemmSmallMatrices.h" #include "gcn_sgemmSmallMatrices.h" #include "hawaii_sgemmBranchKernel.h" #include "hawaii_sgemmSplit64_32.h" #include "gcn_zgemm.h" #include "gpu_dtrsm192.h" #include "hawaii_sgemmBig1024Kernel.h" FunctorSelectorHawaii FunctorSelectorHawaii::instance ; FunctorSelectorHawaii::FunctorSelectorHawaii() : clblasFunctorSelector(HAWAII) { } // // The selector function for DGEMM on hawaii // // clblasDgemmFunctor * FunctorSelectorHawaii::select_dgemm_specific(clblasDgemmFunctor::Args & args) { #ifdef CLBLAS_HAWAII_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_dgemm_specific(args); #else clblasDgemmFunctor * functor; bool NN_NT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans )); bool SmallMatrices = args.M/6*args.N/6<85*85; SmallMatrices= SmallMatrices && ((args.M%24==0&&args.N%24==0)||(args.M%16==0&&args.N%16==0))&&args.K%8==0 && (args.transA==clblasNoTrans && args.transB==clblasTrans );//*/&&NN_NT; bool BestPerf= (args.M%48==0 && args.N%48==0) || (args.M%32==0 && args.M>4000 && args.N%32==0 && args.N>4000) || (args.M%40==0 && args.M>4000 && args.N%40==0 && args.N>4000) || ((args.M%32!=0 && args.M>1000) || (args.N%32!=0 && args.N>1000)) || ((args.M%40!=0 && args.M>1000) || (args.N%40!=0 && args.N>1000)) ; bool useSpliKernel = (NN_NT && BestPerf); if (args.alpha!=0) { if (SmallMatrices) { functor = clBlasGCNDgemmSmallMatricesFunctor::provide(args, "Hawaii"); if (functor) return functor; } else if ( useSpliKernel) { functor = clBlashawaiiDgemmSplitKernelFunctor::provide(args); if (functor) return functor; } functor = clBlasGCNdgemmCommonFunctor::provide(args, "Hawaii"); if (functor) return functor; //{ // functor = clBlashawaiiDgemmChannelConflictFunctor::provide(args); // if (functor) // return functor; //} } // else use the fallback implementation return this->clblasFunctorSelector::select_dgemm_specific(args); #endif } // The selector function for SGEMM on hawaii clblasSgemmFunctor * FunctorSelectorHawaii::select_sgemm_specific(clblasSgemmFunctor::Args & args) { #ifdef CLBLAS_HAWAII_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_sgemm_specific(args); #else //TODO: the logic below is complicated; Needs cleanup; clblasSgemmFunctor * functor; bool Not_TT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans ) || ( args.transA==clblasTrans && args.transB==clblasNoTrans )); bool SmallMatrices = (args.M / 6 * args.N / 6 < 100 * 100) || (args.M / 6 * args.N / 6 < 180 * 180 && (args.M % 32 != 0 || args.N % 32 != 0)) || ((args.M % 64 != 0 && args.N % 64 != 0 && args.M < 1900 && args.N < 1900) && (args.M % 96 != 0 && args.N % 96 != 0 && args.M < 1900 && args.N < 1900)); bool SmallMatricesMod32= (SmallMatrices && (args.M%32==0&&args.N%32==0)) ; SmallMatricesMod32 = SmallMatricesMod32&&Not_TT&&args.K % 16 == 0; //SmallMatrices= false; bool useSpliKernel=((args.M%96==0 && args.N%96==0) || !(args.M%64==0 && args.N%64==0&& args.M<4000 &&args.N<4000)) ; useSpliKernel=useSpliKernel&&Not_TT; //functor = clBlashawaiiSgemmSplit64_32Functor::provide(args, "Hawaii"); //if (functor) // return functor; if ((args.lda % 1024 == 0) && (args.ldb % 1024 == 
0) && (args.K > args.lda / 4)) { if ((args.lda == args.ldb) && (args.lda >= 4096) && (args.lda <= 8192)) // between 4096 and 8192 for now { if (args.lda != 6144)// 6144 is handled by a special case split { // we are going to call 16 GEMMs with M=M/2, N=N/2, K=K/4 // each GEMM requires M%128 == 0, N%128 == 0, K%16 == 0 if (args.M % 256 == 0 && args.N % 256 == 0 && args.K % 64 == 0) { functor = clBlashawaiiSgemmBig1024KernelFunctor::provide(args, "Hawaii"); if (functor) return functor; } } } } if ((args.M >= 1184 && args.N >= 1184) && (args.M <= 3872 && args.N <= 3872) && (args.M % 64 != 0 && args.N % 64 != 0) && (args.M % 96 != 0 && args.N % 96 != 0) && (args.K % 16 == 0)) { //all the mod32 sizes that is not mod64 or mod96 ranging from 1184 to 3872 //non mod32 cases are not implemented in this approach and are of less interest if ((args.M % 32 == 0 && args.N % 32 == 0) && (args.transA == clblasNoTrans && args.transB == clblasTrans)) { functor = clBlashawaiiSgemmSplit64_32Functor::provide(args, "Hawaii"); if (functor) return functor; } } //the English translation of below is: if small matrix that is (not mod32) and (not_TT) and K has to be mod 16 if (SmallMatrices && (!SmallMatricesMod32) && (Not_TT) && (args.K%16 == 0)) { functor = clBlashawaiiSgemmBranchKernelFunctor::provide(args, "Hawaii"); if (functor) return functor; } if (args.alpha!=0 ) { if (SmallMatricesMod32) { functor = clBlasGCNSgemmSmallMatricesFunctor::provide(args, "Hawaii"); if (functor) return functor; } if ( useSpliKernel) { functor = clBlashawaiiSgemmSplitKernelFunctor::provide(args, "Hawaii"); if (functor) return functor; } else { functor = clblasSgemmFunctorGCN::provide(args, "Hawaii"); if (functor) return functor; } } // else use the fallback implementation return this->clblasFunctorSelector::select_sgemm_specific(args); #endif } // The selector function for ZGEMM on hawaii clblasZgemmFunctor * FunctorSelectorHawaii::select_zgemm_specific(clblasZgemmFunctor::Args & args) { if ( args.M%32==0 && args.N%64==0 && args.K%8==0 && args.transA==clblasNoTrans && args.transB==clblasTrans && args.order==clblasColumnMajor) { return clblasZgemmFunctorGCN::provide(args, "Hawaii"); } else { return this->clblasFunctorSelector::select_zgemm_specific(args); } } // The selector function for DTRSM on hawaii // clblasDtrsmFunctor * FunctorSelectorHawaii::select_dtrsm_specific(clblasDtrsmFunctor::Args & args) { #ifdef CLBLAS_HAWAII_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_dtrsm_specific(args); #else clblasDtrsmFunctor * functor; if ((args.M % 192 == 0) && (args.N % 192 == 0)) { //TODO: the implementation of sub block being 192 only supports //side == right //uplo == upper //trans == notrans //M and N need to be mod192 //subblock being 192 is prefered over 128 on Hawaii device since //it does not create "boundary" in DGEMM calls //Hawaii DGEMM calls have better performance when M N K are mod48 if ((args.side == clblasRight) && (args.uplo == clblasUpper) && (args.transA == clblasNoTrans)) { functor = clblasDtrsm192FunctorGpu::provide(args, "Hawaii"); if (functor) return functor; } } //sub block is 128 here functor = clblasDtrsmFunctorGpu::provide(args, "Hawaii"); if (functor) return functor; // else use the fallback implementation return this->clblasFunctorSelector::select_dtrsm_specific(args); #endif } clblas-2.10/src/library/blas/functor/hawaii_dgemmChannelConflict.cc000066400000000000000000000100301264277366700254430ustar00rootroot00000000000000#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include #include #include #include 
#include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_dgemmChannelConflict.h" #if BUILD_KERNEL_FROM_STRING #include "dgemm_hawaiiChannelConfilct.clT" #else #include "dgemm_hawaiiChannelConfilct.clHawaii_64.bin.clT" #endif // Just because the full name is too long typedef clblasDgemmFunctorGCN::Variant Variant ; //typedef clblasFunctorCache Cache ; //static Cache cache ; // Make it 1 to enable additional debug 'print' #define VERB 0 static const Variant * select_variant_hawaiiChannelConflict( clblasDgemmFunctor::Args & args, cl_uint _64BitsUse ) { //return the only variant we have for the moment as we only support NT!!!!! #if BUILD_KERNEL_FROM_STRING static const Variant variant = {"dgemmBlockTempLocalPrefetch", dgemm_NT_ChannelConflict, NULL, NULL, 0, clblasNoTrans, clblasTrans, 256,256,2, {8,8}, {2,4}}; return &variant; #else if(_64BitsUse==64) { static const Variant variant = {"dgemmBlockTempLocalPrefetch", NULL, NULL, dgemm_NT_ChannelConflict_64_bin_Hawaii, sizeof(dgemm_NT_ChannelConflict_64_bin_Hawaii), clblasNoTrans, clblasTrans, 256,256,2, {8,8}, {2,4}}; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } #endif //return NULL; } clBlashawaiiDgemmChannelConflictFunctor::clBlashawaiiDgemmChannelConflictFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variant = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variant->kernel_name) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiDgemmChannelConflictFunctor"); bl.variantRaw( this->m_variant->kernel_name, strlen(this->m_variant->kernel_name)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variant->bin != 0 ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) err = bl.buildFromBinary(this->m_variant->bin, this->m_variant->bin_size, this->m_variant->build_options); } else { // directly build from a char* err = bl.buildFromSource(this->m_variant->source); } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiDgemmChannelConflictFunctor * clBlashawaiiDgemmChannelConflictFunctor::provide(clblasDgemmFunctor::Args & args) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); const Variant * variant = select_variant_hawaiiChannelConflict( args, bitness ) ; if ( variant == NULL ) return NULL ; //for now we only have one variant, but we are working on others one and therefore I prefer keeping the code to manage them //Cache::Lookup lookup(cache, ctxt, dev, variant) ; //if ( lookup.ok() ) //{ // clBlashawaiiDgemmChannelConflictFunctor * functor = lookup.get(); // functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used // return functor; //} clBlashawaiiDgemmChannelConflictFunctor * functor = new clBlashawaiiDgemmChannelConflictFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } //lookup.set(functor) ; return functor; } #endifclblas-2.10/src/library/blas/functor/hawaii_dgemmSplitKernel.cc000066400000000000000000000544571264277366700246720ustar00rootroot00000000000000#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_dgemmSplitKernel.h" #if BUILD_KERNEL_FROM_STRING #include "dgemm_hawaiiSplitKernel.clT" #else #include "dgemm_hawaiiSplitKernel.clHawaii_64.bin.clT" #endif // Just because the full name is too long typedef clBlashawaiiDgemmSplitKernelFunctor::Variant Variant ; //define the string name of the soure/binary code #define DGEMM_SRC_NAME(TA,TB, DIVK, MULT) dgemm_##TA##TB##_##DIVK##_SPLIT##MULT #define DGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) dgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii //variant name used to differentiate the different ones #define DGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "dgemm_" #TA #TB "_" #DIVK "_SPLIT" #MULT //DGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), #define DGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "dgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_SPLIT_" #BLOC #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define DGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE, \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE) { \ DGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \ { KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE } , \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE, \ trans_##TA, trans_##TB, \ DIVK , \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } typedef clblasFunctorCache CacheSplit ; static CacheSplit cachesplit ; // Make it 1 to enable additional debug 'print' #define VERB 0 //static bool applicable( const Variant & var, clblasDgemmFunctor::Args & args, int RefMultiple ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // // //if (args.N>=var.divN && args.N % var.divN != 0 ) // if ( args.N % var.divN != 0 ) // return false ; // if ( args.M % var.divM != 0 ) // return false ; // if(var.Greater[0]?args.M=RefMultiple) // return false; // if(var.Greater[1]?args.N=RefMultiple) // return false; // if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; // return true ; //} 
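// Clarifying note (added, not in the original source): the macros above encode
// the kernel and variant naming scheme for the split DGEMM. For example,
//   DGEMM_KERNEL_NAME(N, T, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA, MAIN)
// expands to "dgemm_NT_48_48_8_8x8_6x6__ALPHABETA_SPLIT_MAIN"
// (transA/transB, M/N/K divisibility, 8x8 work-group, 6x6 micro tile,
// alpha+beta path, MAIN block), and
//   DGEMM_VARIANT_NAME(N, T, 8, __ALPHABETA)
// expands to "dgemm_NT_8_SPLIT__ALPHABETA", the string passed to
// BinaryLookup::variantRaw() in the functor constructor below.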
static const Variant * select_variant_hawaiiSplitKernel( clblasDgemmFunctor::Args & args, cl_uint _64BitsUse ) { if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { // ===== dgemm NN ====== const char* KName_NNMain = DGEMM_KERNEL_NAME(N, N, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NNRow = DGEMM_KERNEL_NAME(N, N, 1, 48, 8, 8, 8, 6, 6, __ALPHABETA, ROW) ; const char* KName_NNColumn = DGEMM_KERNEL_NAME(N, N, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NNSingleWave = DGEMM_KERNEL_NAME(N, N, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NNMainAlpha = DGEMM_KERNEL_NAME(N, N, 48, 48, 8, 8, 8, 6, 6, __ALPHA, MAIN) ; const char* KName_NNRowAlpha = DGEMM_KERNEL_NAME(N, N, 1, 48, 8, 8, 8, 6, 6, __ALPHA, ROW) ; const char* KName_NNColumnAlpha = DGEMM_KERNEL_NAME(N, N, 48, 1, 8, 8, 8, 6, 6, __ALPHA, COLUMN) ; const char* KName_NNSingleWaveAlpha = DGEMM_KERNEL_NAME(N, N, 1, 1, 8, 8, 8, 6, 6, __ALPHA, SINGLE) ; const char* KName_NNMainK1 = DGEMM_KERNEL_NAME(N, N, 48, 48, 1, 8, 8, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NNRowK1 = DGEMM_KERNEL_NAME(N, N, 1, 48, 1, 8, 8, 6, 6, __ALPHABETA, ROW) ; const char* KName_NNColumnK1 = DGEMM_KERNEL_NAME(N, N, 48, 1, 1, 8, 8, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NNSingleWaveK1 = DGEMM_KERNEL_NAME(N, N, 1, 1, 1, 8, 8, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NNMainK1Alpha = DGEMM_KERNEL_NAME(N, N, 48, 48, 1, 8, 8, 6, 6, __ALPHA, MAIN) ; const char* KName_NNRowK1Alpha = DGEMM_KERNEL_NAME(N, N, 1, 48, 1, 8, 8, 6, 6, __ALPHA, ROW) ; const char* KName_NNColumnK1Alpha = DGEMM_KERNEL_NAME(N, N, 48, 1, 1, 8, 8, 6, 6, __ALPHA, COLUMN) ; const char* KName_NNSingleWaveK1Alpha = DGEMM_KERNEL_NAME(N, N, 1, 1, 1, 8, 8, 6, 6, __ALPHA, SINGLE) ; #if BUILD_KERNEL_FROM_STRING const char* KSrc_NTMain = DGEMM_SRC_NAME(N, N, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTRow = DGEMM_SRC_NAME(N, N, 1, 48, 8,8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTColumn = DGEMM_SRC_NAME(N, N, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTSingleWave = DGEMM_SRC_NAME(N, N, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; #else const char* KBin_NNMain64 = DGEMM_SRC_NAME_HAWAII(N, N, 8, __ALPHABETA, 64) ; const size_t KBin_NNMainSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, N, 8, __ALPHABETA, 64)) ; const char* KBin_NNMainAlpha64 = DGEMM_SRC_NAME_HAWAII(N, N, 8, __ALPHA, 64) ; const size_t KBin_NNMainAlphaSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, N, 8, __ALPHA, 64)) ; const char* KBin_NNMainK164 = DGEMM_SRC_NAME_HAWAII(N, N, 1, __ALPHABETA, 64) ; const size_t KBin_NNMainK1Size64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, N, 1, __ALPHABETA, 64)) ; const char* KBin_NNMainK1Alpha64 = DGEMM_SRC_NAME_HAWAII(N, N, 1, __ALPHA, 64) ; const size_t KBin_NNMainK1AlphaSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, N, 1, __ALPHA, 64)) ; #endif if(args.K%8==0) { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,N,8,8,8,6,6,64,__ALPHABETA, KName_NNMain,KName_NNRow, KName_NNColumn, KName_NNSingleWave , NULL, NULL, KBin_NNMain64, KBin_NNMainSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } else { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,N,8,8,8,6,6,64,__ALPHA, KName_NNMainAlpha,KName_NNRowAlpha, KName_NNColumnAlpha, KName_NNSingleWaveAlpha , NULL, NULL, KBin_NNMainAlpha64, KBin_NNMainAlphaSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; 
assert(1); return NULL; } } } else { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,N,1,8,8,6,6,64,__ALPHABETA, KName_NNMainK1,KName_NNRowK1, KName_NNColumnK1, KName_NNSingleWaveK1 , NULL, NULL, KBin_NNMainK164, KBin_NNMainK1Size64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } else { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,N,1,8,8,6,6,64,__ALPHA, KName_NNMainK1Alpha,KName_NNRowK1Alpha, KName_NNColumnK1Alpha, KName_NNSingleWaveK1Alpha , NULL, NULL, KBin_NNMainK1Alpha64, KBin_NNMainK1AlphaSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } } } if (args.transB == clblasTrans) { const char* KName_NTMain = DGEMM_KERNEL_NAME(N, T, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NTRow = DGEMM_KERNEL_NAME(N, T, 1, 48, 8, 8, 8, 6, 6, __ALPHABETA, ROW) ; const char* KName_NTColumn = DGEMM_KERNEL_NAME(N, T, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NTSingleWave = DGEMM_KERNEL_NAME(N, T, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NTMainAlpha = DGEMM_KERNEL_NAME(N, T, 48, 48, 8, 8, 8, 6, 6, __ALPHA, MAIN) ; const char* KName_NTRowAlpha = DGEMM_KERNEL_NAME(N, T, 1, 48, 8, 8, 8, 6, 6, __ALPHA, ROW) ; const char* KName_NTColumnAlpha = DGEMM_KERNEL_NAME(N, T, 48, 1, 8, 8, 8, 6, 6, __ALPHA, COLUMN) ; const char* KName_NTSingleWaveAlpha = DGEMM_KERNEL_NAME(N, T, 1, 1, 8, 8, 8, 6, 6, __ALPHA, SINGLE) ; const char* KName_NTMainK1 = DGEMM_KERNEL_NAME(N, T, 48, 48, 1, 8, 8, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NTRowK1 = DGEMM_KERNEL_NAME(N, T, 1, 48, 1, 8, 8, 6, 6, __ALPHABETA, ROW) ; const char* KName_NTColumnK1 = DGEMM_KERNEL_NAME(N, T, 48, 1, 1, 8, 8, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NTSingleWaveK1 = DGEMM_KERNEL_NAME(N, T, 1, 1, 1, 8, 8, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NTMainK1Alpha = DGEMM_KERNEL_NAME(N, T, 48, 48, 1, 8, 8, 6, 6, __ALPHA, MAIN) ; const char* KName_NTRowK1Alpha = DGEMM_KERNEL_NAME(N, T, 1, 48, 1, 8, 8, 6, 6, __ALPHA, ROW) ; const char* KName_NTColumnK1Alpha = DGEMM_KERNEL_NAME(N, T, 48, 1, 1, 8, 8, 6, 6, __ALPHA, COLUMN) ; const char* KName_NTSingleWaveK1Alpha = DGEMM_KERNEL_NAME(N, T, 1, 1, 1, 8, 8, 6, 6, __ALPHA, SINGLE) ; #if BUILD_KERNEL_FROM_STRING const char* KSrc_NTMain = DGEMM_SRC_NAME(N, T, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTRow = DGEMM_SRC_NAME(N, T, 1, 48, 8,8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTColumn = DGEMM_SRC_NAME(N, T, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTSingleWave = DGEMM_SRC_NAME(N, T, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; #else const char* KBin_NTMain64 = DGEMM_SRC_NAME_HAWAII(N, T, 8, __ALPHABETA, 64) ; const size_t KBin_NTMainSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, T, 8, __ALPHABETA, 64)) ; const char* KBin_NTMainAlpha64 = DGEMM_SRC_NAME_HAWAII(N, T, 8, __ALPHA, 64) ; const size_t KBin_NTMainAlphaSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, T, 8, __ALPHA, 64)) ; const char* KBin_NTMainK164 = DGEMM_SRC_NAME_HAWAII(N, T, 1, __ALPHABETA, 64) ; const size_t KBin_NTMainK1Size64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, T, 1, __ALPHABETA, 64)) ; const char* KBin_NTMainK1Alpha64 = DGEMM_SRC_NAME_HAWAII(N, T, 1, __ALPHA, 64) ; const size_t KBin_NTMainK1AlphaSize64 = sizeof(DGEMM_SRC_NAME_HAWAII(N, T, 1, __ALPHA, 64)) ; #endif // ===== dgemm NT ====== if(args.K%8==0) { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant 
variant = DGEMM_VARIANT_OBJ(N,T,8,8,8,6,6,64,__ALPHABETA, KName_NTMain,KName_NTRow, KName_NTColumn, KName_NTSingleWave , NULL, NULL, KBin_NTMain64, KBin_NTMainSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } else { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,T,8,8,8,6,6,64,__ALPHA, KName_NTMainAlpha,KName_NTRowAlpha, KName_NTColumnAlpha, KName_NTSingleWaveAlpha , NULL, NULL, KBin_NTMainAlpha64, KBin_NTMainAlphaSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } } else { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,T,1,8,8,6,6,64,__ALPHABETA, KName_NTMainK1,KName_NTRowK1, KName_NTColumnK1, KName_NTSingleWaveK1 , NULL, NULL, KBin_NTMainK164, KBin_NTMainK1Size64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } else { if(_64BitsUse==64) { static const Variant variant = DGEMM_VARIANT_OBJ(N,T,1,8,8,6,6,64,__ALPHA, KName_NTMainK1Alpha,KName_NTRowK1Alpha, KName_NTColumnK1Alpha, KName_NTSingleWaveK1Alpha , NULL, NULL, KBin_NTMainK1Alpha64, KBin_NTMainK1AlphaSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } } } } } return NULL; } clBlashawaiiDgemmSplitKernelFunctor::clBlashawaiiDgemmSplitKernelFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variantSplit = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantSplit->variantName) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiDgemmSplitKernelFunctor"); bl.variantRaw( this->m_variantSplit->variantName, strlen(this->m_variantSplit->variantName)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variantSplit->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variantSplit->bin, this->m_variantSplit->bin_size, /*this->m_variantSplit->build_options[i]*/ "-cl-std=2.0"); } else { //// directly build from a char* //for (int i=0; i<4; i++) // if(this->m_variantSplit->source[i] != 0) // err = bl.buildFromSource(this->m_variantSplit->source[i]); if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR SPLIT DGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiDgemmSplitKernelFunctor * clBlashawaiiDgemmSplitKernelFunctor::provide(clblasDgemmFunctor::Args & args) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
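  // Clarifying note (added, not in the original source): the embedded split
  // DGEMM binaries are built with the "-cl-std=2.0" option (see buildFromBinary
  // in the constructor above); consistently with that, provide() below rejects
  // devices reporting an OpenCL version older than 2.0. On a cache hit the
  // existing functor is retained and returned; on a miss a new functor is
  // created from the embedded binary and stored so later calls with the same
  // context, device and variant reuse the compiled program.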
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); int major; int minor; getCLVersion(dev, major, minor); if (major<2) return NULL; const Variant * variant = select_variant_hawaiiSplitKernel( args, bitness ) ; if ( variant == NULL ) return NULL ; CacheSplit::Lookup lookup(cachesplit, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlashawaiiDgemmSplitKernelFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlashawaiiDgemmSplitKernelFunctor * functor = new clBlashawaiiDgemmSplitKernelFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } cl_int clBlashawaiiDgemmSplitKernelFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args) { size_t GlobalX =args.M/m_variantSplit->bwi[0]; GlobalX-=GlobalX%m_variantSplit->ls[0]; // size_t GlobalY = args.N/m_variantSplit->bwi[1]; GlobalY-=GlobalY%m_variantSplit->ls[1]; std::size_t gs[2] = {GlobalX, GlobalY}; cl_int error = 0; if (args.M%48==0 && args.N%48==0) { if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,args.events); return error; } if (args.M%48!=0 && args.N%48!=0 && args.M>=48 && args.N>=48 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1, 2, 3 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[0] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[1] = 8; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[0] = 8; gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if (args.M%48==0 && args.N%48!=0 && args.N>48 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 2, \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } if (args.N%48==0 && args.M%48!=0 && args.M>48 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[0] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } if(args.M<48 && args.N%48==0) { if (VERB) printf(" ===> EXECUTE KERNEL 1, \n") ; gs[0] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); return error; } if(args.M<48 && args.N%48!=0 && args.N>=48) { if (VERB) printf(" ===> EXECUTE KERNEL 1, 3 \n") ; gs[0] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if(args.N<48 && args.M%48==0) { if (VERB) printf(" ===> EXECUTE KERNEL 2 \n") ; gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, 
NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); return error; } if(args.N<48 && args.M%48!=0&& args.M>=48) { if (VERB) printf(" ===> EXECUTE KERNEL 2, 3 \n") ; gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); gs[0] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if (args.N<48 && args.M<48) { if (VERB) printf(" ===> EXECUTE KERNEL 3 \n") ; gs[0] = 8; gs[1] = 8; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls,args.numEventsInWaitList, args.eventWaitList, args.events); return error; } return clblasNotImplemented; } clblasStatus clBlashawaiiDgemmSplitKernelFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variantSplit->kernel_name, args.alpha, args.beta) ; cl_kernel kernel[4]; int NBKernel = 0; for (int i=0; i<4; i++) { if (this->m_variantSplit->kernel_name[i]) { kernel[i ]= clCreateKernel( this->m_program, this->m_variantSplit->kernel_name[i], &err); if (err != CL_SUCCESS) return clblasStatus(err) ; NBKernel++; } else break; } if (NBKernel != 4) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { for (int i=0; i FOUND %s\n", this->m_variantSplit->kernel_name[i]) ; } unsigned int M = (unsigned int )args.M, N = (unsigned int )args.N, K = (unsigned int )args.K; unsigned int lda = (unsigned int )args.lda, ldb = (unsigned int )args.ldb, ldc = (unsigned int )args.ldc; int offsetA = (int)args.offA; int offsetB = (int)args.offB; int offsetC = (int)args.offC; int arg[4]={0, 0, 0, 0} ; //// All dgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) for (int i=0; i(kernel[i], arg[i]++, args.A); setKernelArg(kernel[i], arg[i]++, args.B); setKernelArg(kernel[i], arg[i]++, args.C); setKernelArg(kernel[i], arg[i]++, M); setKernelArg(kernel[i], arg[i]++, N); setKernelArg(kernel[i], arg[i]++, K); setKernelArg(kernel[i], arg[i]++, args.alpha); if (args.beta!=0 && this->m_variantSplit->mult.compare("__ALPHA")!=0) setKernelArg(kernel[i], arg[i]++, args.beta); setKernelArg(kernel[i], arg[i]++, lda); setKernelArg(kernel[i], arg[i]++, ldb); setKernelArg(kernel[i], arg[i]++, ldc); setKernelArg(kernel[i], arg[i]++, offsetA); setKernelArg(kernel[i], arg[i]++, offsetB); setKernelArg(kernel[i], arg[i]++, offsetC); } err = KernelsLaunch(queue, kernel, args); for (int i = 0; i ERR=%d \n",(int)err) ; return clblasStatus(err) ; } #endif clblas-2.10/src/library/blas/functor/hawaii_sgemmBig1024Kernel.cc000066400000000000000000000340201264277366700246060ustar00rootroot00000000000000#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_sgemmBig1024Kernel.h" //only non-multiples of 32 is implemented right now, which is a small matrix. 
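// Clarifying note (added, not in the original source): this functor targets
// large SGEMM NT problems whose leading dimensions are big multiples of 1024.
// The Hawaii selector only routes here when lda%1024==0, ldb%1024==0,
// K > lda/4, lda==ldb with 4096 <= lda <= 8192 (6144 excluded), and M%256==0,
// N%256==0, K%64==0. For lda >= 7168, KernelsLaunch() below decomposes the call
// into 2 x 2 x 4 = 16 sub-GEMM launches of the same 128x128 macro-tile kernel,
// adjusting the A/B/C offsets and applying beta only on the first accumulation
// into each block of C; smaller cases are handled by a single launch.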
#if BUILD_KERNEL_FROM_STRING #include "sgemm_gcn_bigMatrices.clT" #else #include "sgemm_gcn_bigMatrices.clHawaii_64.bin.clT" #include "sgemm_gcn_bigMatrices.clBonaire_64.bin.clT" #endif // Just because the full name is too long typedef clBlashawaiiSgemmBig1024KernelFunctor::Variant Variant; //define the string name of the soure/binary code #define SGEMM_SRC_NAME(TA,TB, DIVK, MULT) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT #define SGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Bonaire #define SGEMM_SRC_NAME_BIN(TA,TB, DIVK, MULT, BITS, DEVICE) SGEMM_SRC_NAME##_##DEVICE(TA,TB, DIVK, MULT, BITS) //variant name used to differentiate the different ones #define SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "sgemm_" #TA #TB "_" #DIVK "_SPLIT" #MULT //SGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), #define SGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT) "sgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ KERNEL_NAME_MAIN, \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE) { \ SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \ { KERNEL_NAME_MAIN } , \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE, \ trans_##TA, trans_##TB, \ DIVK , \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } typedef clblasFunctorCache CacheBig1024; static CacheBig1024 cachebig1024; // Make it 1 to enable additional debug 'print' #define VERB 0 //static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args, int RefMultiple ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // // //if (args.N>=var.divN && args.N % var.divN != 0 ) // if ( args.N % var.divN != 0 ) // return false ; // if ( args.M % var.divM != 0 ) // return false ; // if(var.Greater[0]?args.M=RefMultiple) // return false; // if(var.Greater[1]?args.N=RefMultiple) // return false; // if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; // return true ; //} static void to_upper(char* input) { while(*input) { *input=toupper(*input); input++; } } static const Variant * select_variant_Big1024Kernel(clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse) { if (_64BitsUse != 64) { std::cout << "we don't support clblas on 32 bits" << std::endl; assert(1); return NULL; } if (args.transA == clblasNoTrans) { if (args.transB == clblasNoTrans) { // ===== sgemm NN ====== // sgemm NN does not have big 1024 perf drop problem return NULL; } if (args.transB == clblasTrans) { // ===== SGEMM NT ====== //sgemm_NT_128_128_16_16x16_8x8__ALPHABETA const char* KName_NT = SGEMM_KERNEL_NAME(N, T, 128, 128, 16, 16, 16, 8, 8, __ALPHABETA); const char* KBin_NT64; size_t KBin_NTSize64 = 0; #if BUILD_KERNEL_FROM_STRING //currently not supported return NULL; #else if (!strcmp(DevName, "Hawaii")) { //KBin_NT64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII) ; KBin_NT64 = sgemm_NT_128_128_16_16x16_8x8__ALPHABETA_64_bin_Hawaii; KBin_NTSize64 = sizeof(sgemm_NT_128_128_16_16x16_8x8__ALPHABETA_64_bin_Hawaii); } #endif // ===== SGEMM NT ====== static const Variant variant = 
SGEMM_VARIANT_OBJ(N, T, 16, 16, 16, 8, 8, 64, __ALPHABETA, KName_NT, NULL, NULL, KBin_NT64, KBin_NTSize64); return &variant; } } else { if (args.transB == clblasNoTrans) { // ===== sgemm TN ====== //sgemm TN does not have big 1024 perf drop problem return NULL; } return NULL; } return NULL; } clBlashawaiiSgemmBig1024KernelFunctor::clBlashawaiiSgemmBig1024KernelFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variantBig1024 = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantBig1024->variantName); //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiSgemmBig1024KernelFunctor"); bl.variantRaw(this->m_variantBig1024->variantName, strlen(this->m_variantBig1024->variantName) + 1); if ( !bl.found() ) // may create empty file or may wait until file is ready { if (this->m_variantBig1024->bin != NULL) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variantBig1024->bin, this->m_variantBig1024->bin_size, "-cl-std=2.0"); } else { //// directly build from a char* if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR big 1024 SGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiSgemmBig1024KernelFunctor * clBlashawaiiSgemmBig1024KernelFunctor::provide(clblasSgemmFunctor::Args & args, char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); int major; int minor; getCLVersion(dev, major, minor); //if (major<2) // return NULL; // to_upper( DevName); const Variant * variant = select_variant_Big1024Kernel(args, DevName, bitness); if ( variant == NULL ) return NULL ; CacheBig1024::Lookup lookup(cachebig1024, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlashawaiiSgemmBig1024KernelFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlashawaiiSgemmBig1024KernelFunctor * functor = new clBlashawaiiSgemmBig1024KernelFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } cl_int clBlashawaiiSgemmBig1024KernelFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args) { if (args.lda < 7168) { //((Mvalue - 1) / 128 + 1) * 16 size_t GlobalX = ((args.M - 1) / 128 + 1) * 16; size_t GlobalY = ((args.N - 1) / 128 + 1) * 16; std::size_t gs[2] = { GlobalX, GlobalY }; cl_int error = 0; //if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, args.numEventsInWaitList, args.eventWaitList, args.events); return error; } else { //for example, when M=N=K=8192 //we are gonna call 16 GEMMs //each GEMM has M=N=K=4096 //note are direct GEMM call has a 0.7 TFLOPS performance // [ A11 | A12 | A13 | A14 ] [ B11 | B12 | B13 | B14 ] [ C11 | C12 ] // A = [ A21 | A22 | A23 | A24 ] B = [ B21 | B22 | B23 | B24 ] C = [ C21 | C22 ] // 16 GEMMs are // #01: C11 = a*A11*B11 + b*C11 // #02: C11 = a*A12*B12 + 1*C11 // #03: 
C11 = a*A13*B13 + 1*C11 // #04: C11 = a*A14*B14 + 1*C11 now we are done with C11 // #05: C12 = a*A11*B21 + b*C12 // #06: C12 = a*A12*B22 + 1*C12 // #07: C12 = a*A12*B22 + 1*C12 // #08: C12 = a*A12*B22 + 1*C12 now we are done with C12 // #09: C21 = a*A21*B11 + b*C21 // #10: C21 = a*A22*B12 + 1*C21 // #11: C21 = a*A23*B13 + 1*C21 // #12: C21 = a*A24*B14 + 1*C21 now we are done with C21 // #13: C22 = a*A21*B21 + b*C22 // #14: C22 = a*A22*B22 + 1*C22 // #15: C22 = a*A23*B23 + 1*C22 // #16: C22 = a*A24*B24 + 1*C22 now we are done with C22 unsigned int K_split_factor = 4; unsigned int M_split_factor = 2; unsigned int N_split_factor = 2; unsigned int small_M = args.M / M_split_factor; unsigned int small_N = args.N / N_split_factor; unsigned int small_K = args.K / K_split_factor; size_t GlobalX = ((small_M - 1) / 128 + 1) * 16; size_t GlobalY = ((small_N - 1) / 128 + 1) * 16; std::size_t gs[2] = { GlobalX, GlobalY }; cl_int error = 0; cl_float betaone = 1; error = clSetKernelArg(Kernel[0], 3, sizeof(cl_uint), &small_M); assert(error == CL_SUCCESS); error = clSetKernelArg(Kernel[0], 4, sizeof(cl_uint), &small_N); assert(error == CL_SUCCESS); error = clSetKernelArg(Kernel[0], 5, sizeof(cl_uint), &small_K); assert(error == CL_SUCCESS); for (int M_split_index = 0; M_split_index < M_split_factor; M_split_index++) { //2 groups of GEMMs splited by M from example for (int N_split_index = 0; N_split_index < N_split_factor; N_split_index++) { //2 groups of GEMMs splited by N from example unsigned int offc_C = args.ldc*args.N / N_split_factor * N_split_index + args.M / M_split_factor * M_split_index + args.offC; error = clSetKernelArg(Kernel[0], 13, sizeof(cl_uint), &offc_C); assert(error == CL_SUCCESS); for (int K_split_index = 0; K_split_index < K_split_factor; K_split_index++) { //4 GEMMs splited by K from example unsigned int offa_A = (args.M / M_split_factor * M_split_index) + (args.lda * args.K / K_split_factor * K_split_index) + args.offA; unsigned int offb_B = (args.N / N_split_factor * N_split_index) + (args.ldb * args.K / K_split_factor * K_split_index) + args.offB; error = clSetKernelArg(Kernel[0], 11, sizeof(cl_uint), &offa_A); assert(error == CL_SUCCESS); error = clSetKernelArg(Kernel[0], 12, sizeof(cl_uint), &offb_B); assert(error == CL_SUCCESS); if (K_split_index == 0) { error = clSetKernelArg(Kernel[0], 7, sizeof(cl_float), &(args.beta)); assert(error == CL_SUCCESS); if (M_split_index == 0 && N_split_index == 0) { //very first GEMM call error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, args.numEventsInWaitList, args.eventWaitList, NULL); assert(error == CL_SUCCESS); } else { error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, 0, NULL, NULL); assert(error == CL_SUCCESS); } } else { error = clSetKernelArg(Kernel[0], 7, sizeof(cl_float), &betaone); assert(error == CL_SUCCESS); if ((M_split_index == (M_split_factor - 1) ) && (N_split_index == (N_split_factor - 1)) && (K_split_index == (K_split_factor - 1))) { //very last GEMM call error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, 0, NULL, args.events); assert(error == CL_SUCCESS); } else { error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBig1024->ls, 0, NULL, NULL); assert(error == CL_SUCCESS); } } } } } return error; } return clblasNotImplemented; } clblasStatus clBlashawaiiSgemmBig1024KernelFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f 
,beta = %f\n", this->m_variantBig1024->kernel_name, args.alpha, args.beta); cl_kernel kernel[1]; int NBKernel = 0; if (this->m_variantBig1024->kernel_name[0]) { kernel[0] = clCreateKernel(this->m_program, this->m_variantBig1024->kernel_name[0], &err); if (err != CL_SUCCESS) return clblasStatus(err) ; NBKernel++; } if (NBKernel != 1) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { for (int i=0; i FOUND %s\n", this->m_variantBig1024->kernel_name[i]); } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg[4]={0, 0, 0, 0} ; //// All sgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) for (int i=0; i(kernel[i], arg[i]++, args.A); setKernelArg(kernel[i], arg[i]++, args.B); setKernelArg(kernel[i], arg[i]++, args.C); setKernelArg(kernel[i], arg[i]++, M); setKernelArg(kernel[i], arg[i]++, N); setKernelArg(kernel[i], arg[i]++, K); setKernelArg(kernel[i], arg[i]++, args.alpha); setKernelArg(kernel[i], arg[i]++, args.beta); setKernelArg(kernel[i], arg[i]++, lda); setKernelArg(kernel[i], arg[i]++, ldb); setKernelArg(kernel[i], arg[i]++, ldc); setKernelArg(kernel[i], arg[i]++, offsetA); setKernelArg(kernel[i], arg[i]++, offsetB); setKernelArg(kernel[i], arg[i]++, offsetC); } err = KernelsLaunch(queue, kernel, args); for (int i = 0; i ERR=%d \n",(int)err) ; // err= clFinish(queue); return clblasStatus(err) ; } #endif clblas-2.10/src/library/blas/functor/hawaii_sgemmBranchKernel.cc000066400000000000000000000274611264277366700250060ustar00rootroot00000000000000#ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_sgemmBranchKernel.h" //only non-multiples of 32 is implemented right now, which is a small matrix. 
#if BUILD_KERNEL_FROM_STRING #include "sgemm_gcn_SmallMatrices.clT" #else #include "sgemm_gcn_SmallMatrices.clHawaii_64.bin.clT" #include "sgemm_gcn_SmallMatrices.clBonaire_64.bin.clT" #endif // Just because the full name is too long typedef clBlashawaiiSgemmBranchKernelFunctor::Variant Variant; //define the string name of the soure/binary code #define SGEMM_SRC_NAME(TA,TB, DIVK, MULT) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT #define SGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Bonaire #define SGEMM_SRC_NAME_BIN(TA,TB, DIVK, MULT, BITS, DEVICE) SGEMM_SRC_NAME##_##DEVICE(TA,TB, DIVK, MULT, BITS) //variant name used to differentiate the different ones #define SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "sgemm_" #TA #TB "_" #DIVK "_SPLIT" #MULT //SGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), #define SGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "sgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_" #BLOC #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ KERNEL_NAME_MAIN, \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE) { \ SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \ { KERNEL_NAME_MAIN } , \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE, \ trans_##TA, trans_##TB, \ DIVK , \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } typedef clblasFunctorCache CacheBranch; static CacheBranch cachebranch ; // Make it 1 to enable additional debug 'print' #define VERB 0 //static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args, int RefMultiple ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // // //if (args.N>=var.divN && args.N % var.divN != 0 ) // if ( args.N % var.divN != 0 ) // return false ; // if ( args.M % var.divM != 0 ) // return false ; // if(var.Greater[0]?args.M=RefMultiple) // return false; // if(var.Greater[1]?args.N=RefMultiple) // return false; // if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; // return true ; //} static void to_upper(char* input) { while(*input) { *input=toupper(*input); input++; } } static const Variant * select_variant_BranchKernel(clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse) { if (_64BitsUse != 64) { std::cout << "we don't support clblas on 32 bits" << std::endl; assert(1); return NULL; } if (args.transA == clblasNoTrans) { if (args.transB == clblasNoTrans) { // ===== sgemm NN ====== // sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH const char* KName_NN = SGEMM_KERNEL_NAME(N, N, 32, 32, 16, 16, 16, 2, 2, __ALPHABETA, BRANCH); const char* KBin_NN64; size_t KBin_NNSize64 = 0; #if BUILD_KERNEL_FROM_STRING //currently not supported return NULL; #else if (!strcmp(DevName, "Hawaii")) { KBin_NN64 = sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii; KBin_NNSize64 = sizeof(sgemm_NN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii); } #endif static const Variant variant = SGEMM_VARIANT_OBJ(N, N, 16, 16, 16, 2, 2, 64, __ALPHABETA, KName_NN, NULL, NULL, KBin_NN64, KBin_NNSize64); return &variant; } if (args.transB == clblasTrans) { 
//sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_BRANCH const char* KName_NT = SGEMM_KERNEL_NAME(N, T, 32, 32, 16, 16, 16, 2, 2, __ALPHABETA, BRANCH); const char* KBin_NT64; size_t KBin_NTSize64 = 0; #if BUILD_KERNEL_FROM_STRING //currently not supported return NULL; #else if (!strcmp(DevName, "Hawaii")) { //KBin_NT64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII) ; KBin_NT64 = sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii; KBin_NTSize64 = sizeof(sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii); } else if (!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL KBin_NT64 = sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Bonaire; KBin_NTSize64 = sizeof(sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Bonaire); #endif } #endif // ===== SGEMM NT ====== static const Variant variant = SGEMM_VARIANT_OBJ(N, T, 16, 16, 16, 2, 2, 64, __ALPHABETA, KName_NT, NULL, NULL, KBin_NT64, KBin_NTSize64); return &variant; } } else { if (args.transB == clblasNoTrans) { // ===== sgemm TN ====== //sgemm_TN_32_32_16_16x16_2x2__ALPHABETA_BRANCH const char* KName_TN = SGEMM_KERNEL_NAME(T, N, 32, 32, 16, 16, 16, 2, 2, __ALPHABETA, BRANCH); const char* KBin_TN64; size_t KBin_TNSize64 = 0; #if BUILD_KERNEL_FROM_STRING //currently not supported return NULL; #else if (!strcmp(DevName, "Hawaii")) { KBin_TN64 = sgemm_TN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii; KBin_TNSize64 = sizeof(sgemm_TN_32_32_16_16x16_2x2__ALPHABETA_BRANCH_64_bin_Hawaii); } #endif // ===== SGEMM NT ====== static const Variant variant = SGEMM_VARIANT_OBJ(T, N, 16, 16, 16, 2, 2, 64, __ALPHABETA, KName_TN, NULL, NULL, KBin_TN64, KBin_TNSize64); return &variant; } return NULL; } return NULL; } clBlashawaiiSgemmBranchKernelFunctor::clBlashawaiiSgemmBranchKernelFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variantBranch = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantBranch->variantName) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiSgemmBranchKernelFunctor"); bl.variantRaw( this->m_variantBranch->variantName, strlen(this->m_variantBranch->variantName)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variantBranch->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variantBranch->bin, this->m_variantBranch->bin_size, /*this->m_variantBranch->build_options[i]*/ "-cl-std=2.0"); } else { //// directly build from a char* //for (int i=0; i<4; i++) // if(this->m_variantBranch->source[i] != 0) // err = bl.buildFromSource(this->m_variantSplit->source[i]); if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR Branch SGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiSgemmBranchKernelFunctor * clBlashawaiiSgemmBranchKernelFunctor::provide(clblasSgemmFunctor::Args & args, char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
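  // Launch-geometry note (added, not in the original source): KernelsLaunch()
  // further down computes
  //   GlobalX = ((M - 1) / (bwi[0] * ls[0]) + 1) * 16 = ((M - 1) / 32 + 1) * 16
  // and the analogous GlobalY, i.e. it rounds M and N up to the next multiple
  // of the 32x32 macro tile and launches one 16x16 work-group per tile. For
  // example, M = 1000 gives GlobalX = 512, i.e. 32 work-groups covering 1024
  // rows; the kernel's boundary branches are expected to mask off the 24 rows
  // past the end of C.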
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); int major; int minor; getCLVersion(dev, major, minor); //if (major<2) // return NULL; // to_upper( DevName); const Variant * variant = select_variant_BranchKernel( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; CacheBranch::Lookup lookup(cachebranch, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlashawaiiSgemmBranchKernelFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlashawaiiSgemmBranchKernelFunctor * functor = new clBlashawaiiSgemmBranchKernelFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } cl_int clBlashawaiiSgemmBranchKernelFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args) { // ((Mvalue - 1) / 32 + 1) * 16 size_t GlobalX = ((args.M-1) /(m_variantBranch->bwi[0]*m_variantBranch->ls[0]) + 1)*16 ; // size_t GlobalY = ((args.N - 1) / (m_variantBranch->bwi[1] * m_variantBranch->ls[1]) + 1) * 16; std::size_t gs[2] = {GlobalX, GlobalY}; cl_int error = 0; if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantBranch->ls, args.numEventsInWaitList, args.eventWaitList,args.events); return error; return clblasNotImplemented; } clblasStatus clBlashawaiiSgemmBranchKernelFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variantBranch->kernel_name, args.alpha, args.beta) ; cl_kernel kernel[1]; int NBKernel = 0; if (this->m_variantBranch->kernel_name[0]) { kernel[0]= clCreateKernel( this->m_program, this->m_variantBranch->kernel_name[0], &err); if (err != CL_SUCCESS) return clblasStatus(err) ; NBKernel++; } if (NBKernel != 1) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { for (int i=0; i FOUND %s\n", this->m_variantBranch->kernel_name[i]) ; } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg[4]={0, 0, 0, 0} ; //// All sgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) for (int i=0; i(kernel[i], arg[i]++, args.A); setKernelArg(kernel[i], arg[i]++, args.B); setKernelArg(kernel[i], arg[i]++, args.C); setKernelArg(kernel[i], arg[i]++, M); setKernelArg(kernel[i], arg[i]++, N); setKernelArg(kernel[i], arg[i]++, K); setKernelArg(kernel[i], arg[i]++, args.alpha); //if (args.beta!=0 && this->m_variantBranch->mult.compare("__ALPHA")!=0) setKernelArg(kernel[i], arg[i]++, args.beta); setKernelArg(kernel[i], arg[i]++, lda); setKernelArg(kernel[i], arg[i]++, ldb); setKernelArg(kernel[i], arg[i]++, ldc); setKernelArg(kernel[i], arg[i]++, offsetA); setKernelArg(kernel[i], arg[i]++, offsetB); setKernelArg(kernel[i], arg[i]++, offsetC); } err = KernelsLaunch(queue, kernel, args); for (int i = 0; i ERR=%d \n",(int)err) ; // err= clFinish(queue); return clblasStatus(err) ; } #endif clblas-2.10/src/library/blas/functor/hawaii_sgemmSplit64_32.cc000066400000000000000000000301771264277366700242170ustar00rootroot00000000000000#if !defined CLBLAS_HAWAII_DYNAMIC_KERNEL || !defined CLBLAS_BONAIRE_DYNAMIC_KERNEL //this split kernel algorithm solves the main matrix with 64x64 micro tile size //solves the 
row boundry with 32x64 micro tile size //solves the column boundry with 64x32 micro tile size //solves the rest boundry with 32x32 micro tile size //assumption : after the main matrix being computed by kernels with 64x64 micro tile size, the boundary are of size 32. //in other words, M and N are mod32 and not mod64 #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_sgemmSplit64_32.h" #if BUILD_KERNEL_FROM_STRING //#include "sgemm_hawaiiSplitKernel.clT" #else #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include "sgemm_hawaiiSplit64_32.clHawaii_64.bin.clT" #include "sgemm_gcn.clHawaii_64.bin.clT" #endif//CLBLAS_HAWAII_DYNAMIC_KERNEL #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL //#include "sgemm_hawaiiSplitKernel.clBonaire_64.bin.clT" #endif //CLBLAS_BONAIRE_DYNAMIC_KERNEL #endif //BUILD_KERNEL_FROM_STRING // Just because the full name is too long typedef clBlashawaiiSgemmSplit64_32Functor::Variant Variant; //define the string name of the soure/binary code #define SGEMM_SRC_NAME(TA,TB, DIVK, MULT) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT #define SGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Bonaire #define SGEMM_SRC_NAME_BIN(TA,TB, DIVK, MULT, BITS, DEVICE) SGEMM_SRC_NAME##_##DEVICE(TA,TB, DIVK, MULT, BITS) //variant name used to differentiate the different ones #define SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "sgemm_" #TA #TB "_" #DIVK "_SPLIT64_32" #MULT //SGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), #define SGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "sgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_SPLIT_" #BLOC #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE, \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE) { \ SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \ { KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE } , \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE, \ trans_##TA, trans_##TB, \ DIVK , \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } typedef clblasFunctorCache CacheSplit; static CacheSplit cachesplit ; // Make it 1 to enable additional debug 'print' #define VERB 0 //static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args, int RefMultiple ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // // //if (args.N>=var.divN && args.N % var.divN != 0 ) // if ( args.N % var.divN != 0 ) // return false ; // if ( args.M % var.divM != 0 ) // return false ; // if(var.Greater[0]?args.M=RefMultiple) // return false; // if(var.Greater[1]?args.N=RefMultiple) // return false; // if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; // return true ; //} static void to_upper(char* input) { while(*input) { *input=toupper(*input); input++; } } static const Variant * select_variant_SplitKernel( clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { 
std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { // ===== sgemm NN ====== // NN not implemented yet return NULL; } if (args.transB == clblasTrans) { const char* KName_NTMain = "sgemm_NT_64_64_16_16x16_4x4__ALPHABETA_SPLIT_MAIN" ; const char* KName_NTRow = "sgemm_NT_32_64_16_16x16_2x4__ALPHABETA_SPLIT_ROW" ; const char* KName_NTColumn = "sgemm_NT_64_32_16_16x16_4x2__ALPHABETA_SPLIT_COLUMN" ; const char* KName_NTSingleWave = "sgemm_NT_32_32_16_16x16_2x2__ALPHABETA_SPLIT_SINGLE" ; const char* KBin_NTMain64 ; size_t KBin_NTMainSize64 = 0; if (!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL //KBin_NTMain64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII) ; //KBin_NTMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII)) ; KBin_NTMain64 = sgemm_NT_64_32_SPLIT__ALPHABETA_64_bin_Hawaii; KBin_NTMainSize64 = sizeof(sgemm_NT_64_32_SPLIT__ALPHABETA_64_bin_Hawaii); #endif //CLBLAS_HAWAII_DYNAMIC_KERNEL } else if (!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL //not implemented for Bonaire yet #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } // ===== SGEMM NT ====== static const Variant variant = SGEMM_VARIANT_OBJ(N,T,16,16,16,4,4,64,__ALPHABETA, KName_NTMain,KName_NTRow, KName_NTColumn, KName_NTSingleWave , NULL, NULL, KBin_NTMain64, KBin_NTMainSize64) ; return &variant ; } } else { // TN and TT are not implemented yet return NULL; } return NULL; } clBlashawaiiSgemmSplit64_32Functor::clBlashawaiiSgemmSplit64_32Functor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variantSplit = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantSplit->variantName) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiSgemmSplitKernelFunctor"); bl.variantRaw( this->m_variantSplit->variantName, strlen(this->m_variantSplit->variantName)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variantSplit->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variantSplit->bin, this->m_variantSplit->bin_size, /*this->m_variantSplit->build_options[i]*/ "-cl-std=2.0"); } else { //// directly build from a char* //for (int i=0; i<4; i++) // if(this->m_variantSplit->source[i] != 0) // err = bl.buildFromSource(this->m_variantSplit->source[i]); if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR SPLIT SGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiSgemmSplit64_32Functor * clBlashawaiiSgemmSplit64_32Functor::provide(clblasSgemmFunctor::Args & args, char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); int major; int minor; getCLVersion(dev, major, minor); //if (major<2) // return NULL; // to_upper( DevName); const Variant * variant = select_variant_SplitKernel( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; CacheSplit::Lookup lookup(cachesplit, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlashawaiiSgemmSplit64_32Functor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlashawaiiSgemmSplit64_32Functor * functor = new clBlashawaiiSgemmSplit64_32Functor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } cl_int clBlashawaiiSgemmSplit64_32Functor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args) { //GlobalX = ((Mvalue - 1) / 64) * 16 //GlobalY = ((Nvalue - 1) / 64) * 16 size_t GlobalX = ((args.M - 1) / (m_variantSplit->bwi[0] * m_variantSplit->ls[0])) * 16; size_t GlobalY = ((args.N - 1) / (m_variantSplit->bwi[1] * m_variantSplit->ls[1])) * 16; std::size_t gs[2] = {GlobalX, GlobalY}; cl_int error = 0; //M and N are not mod64 and are mod32 if (args.M % 64 != 0 && args.N % 64 != 0 && args.M % 32 == 0 && args.N % 32 == 0 && args.M >= 64 && args.N >= 64) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1, 2, 3 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[1] = 16; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } return clblasNotImplemented; } clblasStatus clBlashawaiiSgemmSplit64_32Functor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variantSplit->kernel_name, args.alpha, args.beta) ; cl_kernel kernel[4]; int NBKernel = 0; for (int i=0; i<4; i++) { if (this->m_variantSplit->kernel_name[i]) { kernel[i ]= clCreateKernel( this->m_program, this->m_variantSplit->kernel_name[i], &err); if (err != CL_SUCCESS) return clblasStatus(err) ; NBKernel++; } else break; } if (NBKernel != 4) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { for (int i=0; i FOUND %s\n", this->m_variantSplit->kernel_name[i]) ; } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg[4]={0, 0, 0, 0} ; //// All sgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) for (int i=0; i(kernel[i], arg[i]++, args.A); setKernelArg(kernel[i], arg[i]++, args.B); setKernelArg(kernel[i], arg[i]++, args.C); setKernelArg(kernel[i], arg[i]++, M); setKernelArg(kernel[i], arg[i]++, N); setKernelArg(kernel[i], arg[i]++, K); setKernelArg(kernel[i], arg[i]++, args.alpha); //if (args.beta!=0 && this->m_variantSplit->mult.compare("__ALPHA")!=0) setKernelArg(kernel[i], arg[i]++, args.beta); setKernelArg(kernel[i], arg[i]++, lda); setKernelArg(kernel[i], arg[i]++, ldb); setKernelArg(kernel[i], arg[i]++, ldc); 
setKernelArg(kernel[i], arg[i]++, offsetA); setKernelArg(kernel[i], arg[i]++, offsetB); setKernelArg(kernel[i], arg[i]++, offsetC); } err = KernelsLaunch(queue, kernel, args); for (int i = 0; i ERR=%d \n",(int)err) ; // err= clFinish(queue); return clblasStatus(err) ; } #endif clblas-2.10/src/library/blas/functor/hawaii_sgemmSplitKernel.cc000066400000000000000000001102721264277366700246750ustar00rootroot00000000000000#if !defined CLBLAS_HAWAII_DYNAMIC_KERNEL || !defined CLBLAS_BONAIRE_DYNAMIC_KERNEL //this split kernel algorithm solves the main matrix with 96x96 micro tile size //solves the row boundry with 16x96 micro tile size //solves the column boundry with 96x16 micro tile size //solves the rest boundry with 16x16 micro tile size #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include #include #include #include #include #include #include "BinaryBuild.h" #include "hawaii_sgemmSplitKernel.h" #if BUILD_KERNEL_FROM_STRING #include "sgemm_hawaiiSplitKernel.clT" #else #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL #include "sgemm_hawaiiSplitKernel.clHawaii_64.bin.clT" #endif//CLBLAS_HAWAII_DYNAMIC_KERNEL #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL #include "sgemm_hawaiiSplitKernel.clBonaire_64.bin.clT" #endif //CLBLAS_BONAIRE_DYNAMIC_KERNEL #endif //BUILD_KERNEL_FROM_STRING // Just because the full name is too long typedef clBlashawaiiSgemmSplitKernelFunctor::Variant Variant ; //define the string name of the soure/binary code #define SGEMM_SRC_NAME(TA,TB, DIVK, MULT) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT #define SGEMM_SRC_NAME_HAWAII(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Hawaii #define SGEMM_SRC_NAME_BONAIRE(TA,TB, DIVK, MULT, BITS) sgemm_##TA##TB##_##DIVK##_SPLIT##MULT##_##BITS##_bin_Bonaire #define SGEMM_SRC_NAME_BIN(TA,TB, DIVK, MULT, BITS, DEVICE) SGEMM_SRC_NAME##_##DEVICE(TA,TB, DIVK, MULT, BITS) //variant name used to differentiate the different ones #define SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT) "sgemm_" #TA #TB "_" #DIVK "_SPLIT" #MULT //SGEMM_VARIANT_NAME(TA, TB, DIVM , DIVN, DIVK, GREATER48M, GREATER48N, NBKERNEL), #define SGEMM_KERNEL_NAME(TA,TB,DIVM,DIVN,DIVK,BS0,BS1,NV0,NV1,MULT, BLOC) "sgemm_" #TA #TB "_" #DIVM "_" #DIVN "_" #DIVK "_" #BS0 "x" #BS1 "_" #NV0 "x" #NV1 #MULT "_SPLIT_" #BLOC #define trans_N clblasNoTrans #define trans_T clblasTrans // Fill a variant descriptor using OpenCL source #define SGEMM_VARIANT_OBJ(TA,TB,DIVK,BS0,BS1,NV0,NV1, BITS, MULT, \ KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE, \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE) { \ SGEMM_VARIANT_NAME(TA,TB, DIVK, MULT), \ { KERNEL_NAME_MAIN, KERNEL_NAME_ROW, KERNEL_NAME_COLUMN, KERNEL_NAME_SINGLE } , \ KERNELS_SRC, \ KERNEL_BUILD_OPTIONS, \ KERNELS_BIN, \ KERNEL_BIN_SIZE, \ trans_##TA, trans_##TB, \ DIVK , \ { BS0, BS1 } , \ { NV0, NV1 } , \ #MULT \ } typedef clblasFunctorCache CacheSplit ; static CacheSplit cachesplit ; // Make it 1 to enable additional debug 'print' #define VERB 0 //static bool applicable( const Variant & var, clblasSgemmFunctor::Args & args, int RefMultiple ) //{ //#if 0 // // Transpose values are tested in select_variant // if ( args.transA != var.transA ) return false ; // if ( args.transB != var.transB ) return false ; //#endif // // //if (args.N>=var.divN && args.N % var.divN != 0 ) // if ( args.N % var.divN != 0 ) // return false ; // if ( args.M % var.divM != 0 ) // return false ; // if(var.Greater[0]?args.M=RefMultiple) // return false; // 
if(var.Greater[1]?args.N=RefMultiple) // return false; // if ( args.beta==0 && var.mult.compare("__ALPHA")!=0) // return false ; // return true ; //} static void to_upper(char* input) { while(*input) { *input=toupper(*input); input++; } } static const Variant * select_variant_SplitKernel( clblasSgemmFunctor::Args & args, const char* DevName, cl_uint _64BitsUse ) { if(_64BitsUse!=64) { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); return NULL; } if ( args.transA == clblasNoTrans ) { if ( args.transB == clblasNoTrans ) { // ===== sgemm NN ====== // return NULL; const char* KName_NNMain = SGEMM_KERNEL_NAME(N, N, 96, 96, 16, 16, 16, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NNRow = SGEMM_KERNEL_NAME(N, N, 1, 96, 16, 16, 16, 6, 6, __ALPHABETA, ROW) ; const char* KName_NNColumn = SGEMM_KERNEL_NAME(N, N, 96, 1, 16, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NNSingleWave = SGEMM_KERNEL_NAME(N, N, 1, 1, 16, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NNMainAlpha = SGEMM_KERNEL_NAME(N, N, 96, 96, 16, 16, 16, 6, 6, __ALPHA, MAIN) ; const char* KName_NNRowAlpha = SGEMM_KERNEL_NAME(N, N, 1, 96, 16, 16, 16, 6, 6, __ALPHA, ROW) ; const char* KName_NNColumnAlpha = SGEMM_KERNEL_NAME(N, N, 96, 1, 16, 16, 16, 6, 6, __ALPHA, COLUMN) ; const char* KName_NNSingleWaveAlpha = SGEMM_KERNEL_NAME(N, N, 1, 1, 16, 16, 16, 6, 6, __ALPHA, SINGLE) ; const char* KName_NNMainK1 = SGEMM_KERNEL_NAME(N, N, 96, 96, 1, 16, 16, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NNRowK1 = SGEMM_KERNEL_NAME(N, N, 1, 96, 1, 16, 16, 6, 6, __ALPHABETA, ROW) ; const char* KName_NNColumnK1 = SGEMM_KERNEL_NAME(N, N, 96, 1, 1, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NNSingleWaveK1 = SGEMM_KERNEL_NAME(N, N, 1, 1, 1, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NNMainK1Alpha = SGEMM_KERNEL_NAME(N, N, 96, 96, 1, 16, 16, 6, 6, __ALPHA, MAIN) ; const char* KName_NNRowK1Alpha = SGEMM_KERNEL_NAME(N, N, 1, 96, 1, 16, 16, 6, 6, __ALPHA, ROW) ; const char* KName_NNColumnK1Alpha = SGEMM_KERNEL_NAME(N, N, 96, 1, 1, 16, 16, 6, 6, __ALPHA, COLUMN) ; const char* KName_NNSingleWaveK1Alpha = SGEMM_KERNEL_NAME(N, N, 1, 1, 1, 16, 16, 6, 6, __ALPHA, SINGLE) ; #if BUILD_KERNEL_FROM_STRING const char* KSrc_NTMain = SGEMM_SRC_NAME(N, N, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTRow = SGEMM_SRC_NAME(N, N, 1, 48, 8,8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTColumn = SGEMM_SRC_NAME(N, N, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_NTSingleWave = SGEMM_SRC_NAME(N, N, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; #else const char* KBin_NNMain64 ; size_t KBin_NNMainSize64 = 0; const char* KBin_NNMainAlpha64 ; size_t KBin_NNMainAlphaSize64 = 0; const char* KBin_NNMainK164 ; size_t KBin_NNMainK1Size64 = 0; const char* KBin_NNMainK1Alpha64 ; size_t KBin_NNMainK1AlphaSize64 = 0; if (!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL KBin_NNMain64 = SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHABETA, 64, HAWAII) ; KBin_NNMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHABETA, 64, HAWAII)) ; KBin_NNMainAlpha64 = SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHA, 64, HAWAII) ; KBin_NNMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHA, 64, HAWAII)) ; KBin_NNMainK164 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, HAWAII) ; KBin_NNMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, HAWAII)) ; KBin_NNMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, HAWAII) ; KBin_NNMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, HAWAII)) ; #endif 
//CLBLAS_HAWAII_DYNAMIC_KERNEL } else if (!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL KBin_NNMain64 = SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHABETA, 64, BONAIRE) ; KBin_NNMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHABETA, 64, BONAIRE)) ; KBin_NNMainAlpha64 = SGEMM_SRC_NAME_BIN(N, N, 16, __ALPHA, 64, BONAIRE) ; KBin_NNMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHA, 64, BONAIRE)) ; KBin_NNMainK164 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, BONAIRE) ; KBin_NNMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, BONAIRE)) ; KBin_NNMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, BONAIRE) ; KBin_NNMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, BONAIRE)) ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif if(args.K%16==0) { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,16,16,16,6,6,64,__ALPHABETA, KName_NNMain,KName_NNRow, KName_NNColumn, KName_NNSingleWave , NULL, NULL, KBin_NNMain64, KBin_NNMainSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } } else { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,16,16,16,6,6,64,__ALPHA, KName_NNMainAlpha,KName_NNRowAlpha, KName_NNColumnAlpha, KName_NNSingleWaveAlpha , NULL, NULL, KBin_NNMainAlpha64, KBin_NNMainAlphaSize64) ; return &variant ; } } else { if (args.beta!=0) { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,1,16,16,6,6,64,__ALPHABETA, KName_NNMainK1,KName_NNRowK1, KName_NNColumnK1, KName_NNSingleWaveK1 , NULL, NULL, KBin_NNMainK164, KBin_NNMainK1Size64) ; return &variant ; } else { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,1,16,16,6,6,64,__ALPHA, KName_NNMainK1Alpha,KName_NNRowK1Alpha, KName_NNColumnK1Alpha, KName_NNSingleWaveK1Alpha , NULL, NULL, KBin_NNMainK1Alpha64, KBin_NNMainK1AlphaSize64) ; return &variant ; } } } if (args.transB == clblasTrans) { const char* KName_NTMain = SGEMM_KERNEL_NAME(N, T, 96, 96, 16, 16, 16, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NTRow = SGEMM_KERNEL_NAME(N, T, 1, 96, 16, 16, 16, 6, 6, __ALPHABETA, ROW) ; const char* KName_NTColumn = SGEMM_KERNEL_NAME(N, T, 96, 1, 16, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NTSingleWave = SGEMM_KERNEL_NAME(N, T, 1, 1, 16, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NTMainAlpha = SGEMM_KERNEL_NAME(N, T, 96, 96, 16, 16, 16, 6, 6, __ALPHA, MAIN) ; const char* KName_NTRowAlpha = SGEMM_KERNEL_NAME(N, T, 1, 96, 16, 16, 16, 6, 6, __ALPHA, ROW) ; const char* KName_NTColumnAlpha = SGEMM_KERNEL_NAME(N, T, 96, 1, 16, 16, 16, 6, 6, __ALPHA, COLUMN) ; const char* KName_NTSingleWaveAlpha = SGEMM_KERNEL_NAME(N, T, 1, 1, 16, 16, 16, 6, 6, __ALPHA, SINGLE) ; const char* KName_NTMainK1 = SGEMM_KERNEL_NAME(N, T, 96, 96, 1, 16, 16, 6, 6, __ALPHABETA, MAIN) ; const char* KName_NTRowK1 = SGEMM_KERNEL_NAME(N, T, 1, 96, 1, 16, 16, 6, 6, __ALPHABETA, ROW) ; const char* KName_NTColumnK1 = SGEMM_KERNEL_NAME(N, T, 96, 1, 1, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_NTSingleWaveK1 = SGEMM_KERNEL_NAME(N, T, 1, 1, 1, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_NTMainK1Alpha = SGEMM_KERNEL_NAME(N, T, 96, 96, 1, 16, 16, 6, 6, __ALPHA, MAIN) ; const char* KName_NTRowK1Alpha = SGEMM_KERNEL_NAME(N, T, 1, 96, 1, 16, 16, 6, 6, __ALPHA, ROW) ; const char* KName_NTColumnK1Alpha = SGEMM_KERNEL_NAME(N, T, 96, 1, 1, 16, 16, 6, 6, __ALPHA, COLUMN) ; const char* KName_NTSingleWaveK1Alpha = SGEMM_KERNEL_NAME(N, T, 1, 1, 1, 16, 16, 6, 6, __ALPHA, 
SINGLE) ; const char* KBin_NTMain64 ; size_t KBin_NTMainSize64 = 0; const char* KBin_NTMainAlpha64 ; size_t KBin_NTMainAlphaSize64 = 0; const char* KBin_NTMainK164 ; size_t KBin_NTMainK1Size64 = 0; const char* KBin_NTMainK1Alpha64 ; size_t KBin_NTMainK1AlphaSize64 = 0; #if BUILD_KERNEL_FROM_STRING const char* KSrc_NTMain = SGEMM_SRC_NAME(N, T, 96, 96, 16, 16, 16, 6, 6, __ALPHABETA) ; const char* KSrc_NTRow = SGEMM_SRC_NAME(N, T, 1, 96, 16,16, 16, 6, 6, __ALPHABETA) ; const char* KSrc_NTColumn = SGEMM_SRC_NAME(N, T, 96, 1, 16, 16, 16, 6, 6, __ALPHABETA) ; const char* KSrc_NTSingleWave = SGEMM_SRC_NAME(N, T, 1, 1, 16, 16, 16, 6, 6, __ALPHABETA) ; #else if (!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL KBin_NTMain64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII) ; KBin_NTMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, HAWAII)) ; KBin_NTMainAlpha64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHA, 64, HAWAII) ; KBin_NTMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHA, 64, HAWAII)) ; KBin_NTMainK164 = SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHABETA, 64, HAWAII) ; KBin_NTMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHABETA, 64, HAWAII)) ; KBin_NTMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHA, 64, HAWAII) ; KBin_NTMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHA, 64, HAWAII)) ; #endif //CLBLAS_HAWAII_DYNAMIC_KERNEL } else if (!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL KBin_NTMain64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, BONAIRE) ; KBin_NTMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHABETA, 64, BONAIRE)) ; KBin_NTMainAlpha64 = SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHA, 64, BONAIRE) ; KBin_NTMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 16, __ALPHA, 64, BONAIRE)) ; KBin_NTMainK164 = SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHABETA, 64, BONAIRE) ; KBin_NTMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHABETA, 64, BONAIRE)) ; KBin_NTMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHA, 64, BONAIRE) ; KBin_NTMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, T, 1, __ALPHA, 64, BONAIRE)) ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif // ===== SGEMM NT ====== if(args.K%16==0) { if (args.beta!=0) { static const Variant variant = SGEMM_VARIANT_OBJ(N,T,16,16,16,6,6,64,__ALPHABETA, KName_NTMain,KName_NTRow, KName_NTColumn, KName_NTSingleWave , NULL, NULL, KBin_NTMain64, KBin_NTMainSize64) ; return &variant ; } else { static const Variant variant = SGEMM_VARIANT_OBJ(N,T,16,16,16,6,6,64,__ALPHA, KName_NTMainAlpha,KName_NTRowAlpha, KName_NTColumnAlpha, KName_NTSingleWaveAlpha , NULL, NULL, KBin_NTMainAlpha64, KBin_NTMainAlphaSize64) ; return &variant ; } } else { if (args.beta!=0) { static const Variant variant = SGEMM_VARIANT_OBJ(N,T,1,16,16,6,6,64,__ALPHABETA, KName_NTMainK1,KName_NTRowK1, KName_NTColumnK1, KName_NTSingleWaveK1 , NULL, NULL, KBin_NTMainK164, KBin_NTMainK1Size64) ; return &variant ; } else { static const Variant variant = SGEMM_VARIANT_OBJ(N,T,1,16,16,6,6,64,__ALPHA, KName_NTMainK1Alpha,KName_NTRowK1Alpha, KName_NTColumnK1Alpha, KName_NTSingleWaveK1Alpha , NULL, NULL, KBin_NTMainK1Alpha64, KBin_NTMainK1AlphaSize64) ; return &variant ; } } } } else { if ( args.transB == clblasNoTrans ) { // ===== sgemm TN ====== // return NULL; const char* KName_TNMain = SGEMM_KERNEL_NAME(T, N, 96, 96, 16, 16, 16, 6, 6, __ALPHABETA, MAIN) ; const char* KName_TNRow = SGEMM_KERNEL_NAME(T, N, 1, 96, 16, 16, 16, 6, 6, __ALPHABETA, ROW) ; const char* KName_TNColumn = SGEMM_KERNEL_NAME(T, N, 96, 1, 
16, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; const char* KName_TNSingleWave = SGEMM_KERNEL_NAME(T, N, 1, 1, 16, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; const char* KName_TNMainAlpha = SGEMM_KERNEL_NAME(T, N, 96, 96, 16, 16, 16, 6, 6, __ALPHA, MAIN) ; const char* KName_TNRowAlpha = SGEMM_KERNEL_NAME(T, N, 1, 96, 16, 16, 16, 6, 6, __ALPHA, ROW) ; const char* KName_TNColumnAlpha = SGEMM_KERNEL_NAME(T, N, 96, 1, 16, 16, 16, 6, 6, __ALPHA, COLUMN) ; const char* KName_TNSingleWaveAlpha = SGEMM_KERNEL_NAME(T, N, 1, 1, 16, 16, 16, 6, 6, __ALPHA, SINGLE) ; //const char* KName_TNMainK1 = SGEMM_KERNEL_NAME(N, N, 96, 96, 1, 16, 16, 6, 6, __ALPHABETA, MAIN) ; //const char* KName_TNRowK1 = SGEMM_KERNEL_NAME(N, N, 1, 96, 1, 16, 16, 6, 6, __ALPHABETA, ROW) ; //const char* KName_TNColumnK1 = SGEMM_KERNEL_NAME(N, N, 96, 1, 1, 16, 16, 6, 6, __ALPHABETA, COLUMN) ; //const char* KName_TNSingleWaveK1 = SGEMM_KERNEL_NAME(N, N, 1, 1, 1, 16, 16, 6, 6, __ALPHABETA, SINGLE) ; // //const char* KName_TNMainK1Alpha = SGEMM_KERNEL_NAME(N, N, 96, 96, 1, 16, 16, 6, 6, __ALPHA, MAIN) ; //const char* KName_TNRowK1Alpha = SGEMM_KERNEL_NAME(N, N, 1, 96, 1, 16, 16, 6, 6, __ALPHA, ROW) ; //const char* KName_TNColumnK1Alpha = SGEMM_KERNEL_NAME(N, N, 96, 1, 1, 16, 16, 6, 6, __ALPHA, COLUMN) ; //const char* KName_TNSingleWaveK1Alpha = SGEMM_KERNEL_NAME(N, N, 1, 1, 1, 16, 16, 6, 6, __ALPHA, SINGLE) ; #if BUILD_KERNEL_FROM_STRING const char* KSrc_TNMain = SGEMM_SRC_NAME(T, N, 48, 48, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_TNRow = SGEMM_SRC_NAME(T, N, 1, 48, 8,8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_TNColumn = SGEMM_SRC_NAME(T, N, 48, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; const char* KSrc_TNSingleWave = SGEMM_SRC_NAME(T, N, 1, 1, 8, 8, 8, 6, 6, __ALPHABETA) ; #else const char* KBin_TNMain64 ; size_t KBin_TNMainSize64 = 0; const char* KBin_TNMainAlpha64 ; size_t KBin_TNMainAlphaSize64 = 0; //const char* KBin_NNMainK164 ; //size_t KBin_NNMainK1Size64 = 0; //const char* KBin_NNMainK1Alpha64 ; //size_t KBin_NNMainK1AlphaSize64 = 0; if (!strcmp(DevName, "Hawaii")) { #ifndef CLBLAS_HAWAII_DYNAMIC_KERNEL KBin_TNMain64 = SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHABETA, 64, HAWAII) ; KBin_TNMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHABETA, 64, HAWAII)) ; KBin_TNMainAlpha64 = SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHA, 64, HAWAII) ; KBin_TNMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHA, 64, HAWAII)) ; //KBin_NNMainK164 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, HAWAII) ; //KBin_NNMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, HAWAII)) ; //KBin_NNMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, HAWAII) ; //KBin_NNMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, HAWAII)) ; #endif //CLBLAS_HAWAII_DYNAMIC_KERNEL } else if (!strcmp(DevName, "Bonaire")) { #ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL KBin_TNMain64 = SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHABETA, 64, BONAIRE) ; KBin_TNMainSize64 = sizeof(SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHABETA, 64, BONAIRE)) ; KBin_TNMainAlpha64 = SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHA, 64, BONAIRE) ; KBin_TNMainAlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(T, N, 16, __ALPHA, 64, BONAIRE)) ; //KBin_NNMainK164 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, BONAIRE) ; //KBin_NNMainK1Size64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHABETA, 64, BONAIRE)) ; //KBin_NNMainK1Alpha64 = SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, BONAIRE) ; //KBin_NNMainK1AlphaSize64 = sizeof(SGEMM_SRC_NAME_BIN(N, N, 1, __ALPHA, 64, BONAIRE)) ; #endif //#ifndef CLBLAS_BONAIRE_DYNAMIC_KERNEL } #endif 
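// The selection below mirrors the NN and NT paths above: the TN split variant is
// only wired up for the K % 16 == 0 case, so when K is not a multiple of 16 the
// function falls through and returns NULL, letting the caller pick another sgemm
// functor. Within that case, beta decides between the __ALPHABETA kernels (which
// read back and scale C) and the __ALPHA kernels (which simply overwrite C).
// Illustrative sketch of the decision only, not library code:
//
//   if (args.K % 16 == 0)
//       variant = (args.beta != 0) ? "sgemm_TN_16_SPLIT__ALPHABETA"
//                                  : "sgemm_TN_16_SPLIT__ALPHA";
//   else
//       variant = none;   // fall back to a non-split sgemm functor
//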
if(args.K%16==0) { if (args.beta!=0) { static const Variant variant = SGEMM_VARIANT_OBJ(T,N,16,16,16,6,6,64,__ALPHABETA, KName_TNMain,KName_TNRow, KName_TNColumn, KName_TNSingleWave , NULL, NULL, KBin_TNMain64, KBin_TNMainSize64) ; return &variant ; } else { static const Variant variant = SGEMM_VARIANT_OBJ(T,N,16,16,16,6,6,64,__ALPHA, KName_TNMainAlpha,KName_TNRowAlpha, KName_TNColumnAlpha, KName_TNSingleWaveAlpha , NULL, NULL, KBin_TNMainAlpha64, KBin_TNMainAlphaSize64) ; return &variant ; } } /* else { if (args.beta!=0) { if(_64BitsUse==64) { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,1,16,16,6,6,64,__ALPHABETA, KName_NNMainK1,KName_NNRowK1, KName_NNColumnK1, KName_NNSingleWaveK1 , NULL, NULL, KBin_NNMainK164, KBin_NNMainK1Size64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } } else { if(_64BitsUse==64) { static const Variant variant = SGEMM_VARIANT_OBJ(N,N,1,16,16,6,6,64,__ALPHA, KName_NNMainK1Alpha,KName_NNRowK1Alpha, KName_NNColumnK1Alpha, KName_NNSingleWaveK1Alpha , NULL, NULL, KBin_NNMainK1Alpha64, KBin_NNMainK1AlphaSize64) ; return &variant ; } else { std::cout<<"we don't support clblas on 32 bits"<< std::endl; assert(1); } } }*/ } } return NULL; } clBlashawaiiSgemmSplitKernelFunctor::clBlashawaiiSgemmSplitKernelFunctor(Args & args, const Variant * variant, cl_int & err) { cl_device_id device; cl_context context; m_program=NULL; m_variantSplit = variant; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } if (VERB) printf(" ===> GET KERNEL %s\n", this->m_variantSplit->variantName) ; //Ben do I use the correct "kernel_name"? BinaryLookup bl(context, device, "clBlashawaiiSgemmSplitKernelFunctor"); bl.variantRaw( this->m_variantSplit->variantName, strlen(this->m_variantSplit->variantName)+1 ) ; if ( !bl.found() ) // may create empty file or may wait until file is ready { if ( this->m_variantSplit->bin != NULL ) { // build from a pre-compiled version of the kernel (SPIR or cl binaries) //only 1 binary containing all the kernel err = bl.buildFromBinary(this->m_variantSplit->bin, this->m_variantSplit->bin_size, /*this->m_variantSplit->build_options[i]*/ "-cl-std=2.0"); } else { //// directly build from a char* //for (int i=0; i<4; i++) // if(this->m_variantSplit->source[i] != 0) // err = bl.buildFromSource(this->m_variantSplit->source[i]); if (VERB) printf(" ===> BUILD PROBLEM WE DON'T SUPPORT SOURCE BUILD FOR SPLIT SGEMM\n") ; return; } if ( err != CL_SUCCESS ) { if (VERB) printf(" ===> BUILD PROBLEM\n") ; return; } } this->m_program = bl.getProgram(); } clBlashawaiiSgemmSplitKernelFunctor * clBlashawaiiSgemmSplitKernelFunctor::provide(clblasSgemmFunctor::Args & args, char* DevName) { if ( args.order == clblasRowMajor ) return NULL ; // The RowMajor case shall never occur. 
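// From here on this is the usual functor-cache "provide" pattern documented in
// functor.h: query the device and context from the queue, pick a kernel variant
// for the device and its address width (only 64-bit is supported), then look the
// (context, device, variant) triple up in the static 'cachesplit'. A cache hit is
// retain()'d and returned directly; a miss constructs the functor (which builds or
// loads the program) and publishes it through lookup.set(). Condensed sketch of
// the flow below, for orientation only:
//
//   CacheSplit::Lookup lookup(cachesplit, ctxt, dev, variant);
//   if (lookup.ok()) { functor = lookup.get(); functor->retain(); }
//   else             { functor = new clBlashawaiiSgemmSplitKernelFunctor(args, variant, err);
//                      lookup.set(functor); }
//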
cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } cl_uint bitness = getAddressBits(dev); int major; int minor; getCLVersion(dev, major, minor); //if (major<2) // return NULL; // to_upper( DevName); const Variant * variant = select_variant_SplitKernel( args, DevName, bitness ) ; if ( variant == NULL ) return NULL ; CacheSplit::Lookup lookup(cachesplit, ctxt, dev, variant) ; if ( lookup.ok() ) { clBlashawaiiSgemmSplitKernelFunctor * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clBlashawaiiSgemmSplitKernelFunctor * functor = new clBlashawaiiSgemmSplitKernelFunctor(args, variant, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } cl_int clBlashawaiiSgemmSplitKernelFunctor::KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args) { size_t GlobalX =args.M/m_variantSplit->bwi[0]; GlobalX-=GlobalX%m_variantSplit->ls[0]; // size_t GlobalY = args.N/m_variantSplit->bwi[1]; GlobalY-=GlobalY%m_variantSplit->ls[1]; std::size_t gs[2] = {GlobalX, GlobalY}; cl_int error = 0; //deals with square matrix sizes where K is mod 16 for now if (args.lda == args.ldb) { if ((args.K % 16 == 0) && (args.lda >= 6144) && (args.ldb >= 6144)) { if ((args.lda % 1024 == 0) && (args.ldb % 1024 == 0) && (args.transA == clblasNoTrans) && (args.transB == clblasTrans)) { //handles special cases where a direct call to "sgemm_NT_96_96_16..." causes perf drop due to cache miss/thrashing //this special cases is: sgemm column major NT / sgemm row major TN; lda and ldb are big multiples of 1024 such as 4096 and 6144 //K is bigger than a threshold: 1536 for lda=ldb=6144 // int K_block_size; if (args.lda == 6144) { K_block_size = 1536; } else { K_block_size = 128; } if (args.M % 96 == 0 && args.N % 96 == 0) { if (VERB) printf(" ===> EXECUTE KERNEL 0 \n"); if (args.K > K_block_size) { //split into many GEMM calls with K = K_block_size //there are at least 2 GEMM calls int num_of_gemm = ((args.K - 1) / K_block_size) + 1; //call first GEMM unsigned int small_K = K_block_size; setKernelArg(Kernel[0], 5, small_K); error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); //call middle GEMMs unsigned beta_one = 1.0f; setKernelArg(Kernel[0], 7, beta_one); for (int i = 1; i < num_of_gemm - 1; i++) { unsigned offa_i = args.lda * (args.K / num_of_gemm) * i + args.offA; unsigned offb_i = args.ldb * (args.K / num_of_gemm) * i + args.offB; setKernelArg(Kernel[0], 11, offa_i); setKernelArg(Kernel[0], 12, offb_i); error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); } //call last GEMM //the last GEMM's K might be smaller than small_K unsigned int residue_K = args.K % small_K; if (residue_K == 0) residue_K = small_K; unsigned offa_i = args.lda * (args.K / num_of_gemm) * (num_of_gemm - 1) + args.offA; unsigned offb_i = args.ldb * (args.K / num_of_gemm) * (num_of_gemm - 1) + args.offB; setKernelArg(Kernel[0], 5, residue_K); setKernelArg(Kernel[0], 11, offa_i); setKernelArg(Kernel[0], 12, offb_i); error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } } if (args.M % 96 != 0 && args.N % 96 != 0 && args.M >= 96 && args.N >= 96) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1, 2, 3 \n"); if (args.K > K_block_size) { int num_of_gemm = ((args.K - 1) / 
K_block_size) + 1; //first 4 GEMMs unsigned int small_K = K_block_size; setKernelArg(Kernel[0], 5, small_K); error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[1] = 16; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); //middle GEMMs unsigned beta_one = 1.0f; setKernelArg(Kernel[0], 7, beta_one); for (int i = 1; i < num_of_gemm - 1; i++) { unsigned offa_i = args.lda * (args.K / num_of_gemm) * i + args.offA; unsigned offb_i = args.ldb * (args.K / num_of_gemm) * i + args.offB; setKernelArg(Kernel[0], 11, offa_i); setKernelArg(Kernel[0], 12, offb_i); //gs[2] = {GlobalX, GlobalY}; gs[0] = GlobalX; gs[1] = GlobalY; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[1] = 16; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); } //last 4 GEMMs unsigned int residue_K = args.K % small_K; if (residue_K == 0) residue_K = small_K; unsigned offa_i = args.lda * (args.K / num_of_gemm) * (num_of_gemm - 1) + args.offA; unsigned offb_i = args.ldb * (args.K / num_of_gemm) * (num_of_gemm - 1) + args.offB; setKernelArg(Kernel[0], 5, residue_K); setKernelArg(Kernel[0], 11, offa_i); setKernelArg(Kernel[0], 12, offb_i); gs[0] = GlobalX; gs[1] = GlobalY; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[1] = 16; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL, NULL); gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } } } } } if (args.M%96==0 && args.N%96==0) { if (VERB) printf(" ===> EXECUTE KERNEL 0 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,args.events); return error; } if (args.M%96!=0 && args.N%96!=0 && args.M>=96 && args.N>=96 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1, 2, 3 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[1] = 16; gs[0] = GlobalX; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, 0, NULL,NULL); gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if (args.M%96==0 && args.N%96!=0 && args.N>96 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 2, \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, 
NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } if (args.N%96==0 && args.M%96!=0 && args.M>96 ) { if (VERB) printf(" ===> EXECUTE KERNEL 0, 1 \n") ; error = clEnqueueNDRangeKernel(queue, Kernel[0], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList,NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, 0, NULL, args.events); return error; } if(args.M<96 && args.N%96==0) { if (VERB) printf(" ===> EXECUTE KERNEL 1, \n") ; gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); return error; } if(args.M<96 && args.N%96!=0 && args.N>=96) { if (VERB) printf(" ===> EXECUTE KERNEL 1, 3 \n") ; gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[1], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if(args.N<96 && args.M%96==0) { if (VERB) printf(" ===> EXECUTE KERNEL 2 \n") ; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, args.events); return error; } if(args.N<96 && args.M%96!=0&& args.M>=96) { if (VERB) printf(" ===> EXECUTE KERNEL 2, 3 \n") ; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[2], 2, NULL, gs, m_variantSplit->ls, args.numEventsInWaitList, args.eventWaitList, NULL); gs[0] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls, 0, NULL,args.events); return error; } if (args.N<96 && args.M<96) { if (VERB) printf(" ===> EXECUTE KERNEL 3 \n") ; gs[0] = 16; gs[1] = 16; error |= clEnqueueNDRangeKernel(queue, Kernel[3], 2, NULL, gs, m_variantSplit->ls,args.numEventsInWaitList, args.eventWaitList, args.events); return error; } return clblasNotImplemented; } clblasStatus clBlashawaiiSgemmSplitKernelFunctor::execute(Args &args) { cl_int err; cl_command_queue queue = args.queue; if (VERB) printf(" ===> EXECUTE KERNEL %s, alpha =%f ,beta = %f\n", this->m_variantSplit->kernel_name, args.alpha, args.beta) ; cl_kernel kernel[4]; int NBKernel = 0; for (int i=0; i<4; i++) { if (this->m_variantSplit->kernel_name[i]) { kernel[i ]= clCreateKernel( this->m_program, this->m_variantSplit->kernel_name[i], &err); if (err != CL_SUCCESS) return clblasStatus(err) ; NBKernel++; } else break; } if (NBKernel != 4) return clblasStatus(clblasBuildProgramFailure) ; if (VERB) { for (int i=0; i FOUND %s\n", this->m_variantSplit->kernel_name[i]) ; } int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; int arg[4]={0, 0, 0, 0} ; //// All sgemm kernels shall have the same arguments: (A,B,C,M,N,K,alpha,beta,lda,ldb,ldc,offa,offb,offc) for (int i=0; i(kernel[i], arg[i]++, args.A); setKernelArg(kernel[i], arg[i]++, args.B); setKernelArg(kernel[i], arg[i]++, args.C); setKernelArg(kernel[i], arg[i]++, M); setKernelArg(kernel[i], arg[i]++, N); setKernelArg(kernel[i], arg[i]++, K); setKernelArg(kernel[i], arg[i]++, args.alpha); if (args.beta!=0 && this->m_variantSplit->mult.compare("__ALPHA")!=0) setKernelArg(kernel[i], arg[i]++, args.beta); setKernelArg(kernel[i], arg[i]++, lda); setKernelArg(kernel[i], arg[i]++, ldb); setKernelArg(kernel[i], arg[i]++, ldc); setKernelArg(kernel[i], arg[i]++, offsetA); 
setKernelArg(kernel[i], arg[i]++, offsetB); setKernelArg(kernel[i], arg[i]++, offsetC); } err = KernelsLaunch(queue, kernel, args); for (int i = 0; i ERR=%d \n",(int)err) ; // err= clFinish(queue); return clblasStatus(err) ; } #endif clblas-2.10/src/library/blas/functor/include/000077500000000000000000000000001264277366700211775ustar00rootroot00000000000000clblas-2.10/src/library/blas/functor/include/BinaryBuild.h000066400000000000000000000004121264277366700235510ustar00rootroot00000000000000#ifndef _BINARY_BUILD_ #define _BINARY_BUILD_ //#include "CL\opencl.h" //manage if we use cl binaries or cl source code //#define BUILD_KERNEL_FROM_STRING 1 //find if we use in 32 or 64 bits ISA //extern /*char * _64Bits;*/cl_uint _64Bits; #endif //_BINARY_BUILD_clblas-2.10/src/library/blas/functor/include/atomic_counter.h000066400000000000000000000071061264277366700243670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_ATOMIC_COUNTER_H_ #define _CLBLAS_ATOMIC_COUNTER_H_ #include // // This header provides the class clblasAtomicCounter that can be // used to implement a thread-safe usage counter as follow: // // class MyObject // { // ... // clblasAtomicCounter counter ; // ... // // MyObject() : counter(1) { // ... // } // // void retain() { // counter.increment() ; // } // // void release() { // if ( counter.decrement() == 0 ) { // delete this ; // } // } // // } // // // This header provides 2 versions controled by the macro CLBLAS_USE_STD_ATOMIC: // // - if CLBLAS_USE_STD_ATOMIC is set to non-zero then an implementation // based on from C++11 is used // // - if CLBLAS_USE_STD_ATOMIC is set to zero then a portable but less efficient // version using mutex is used // // - else if CLBLAS_USE_STD_ATOMIC is unset then an automatic detection of // and C++11 is automatically attempted // // Remark: there exists several other atomic implementations (e.g. boost, ...) that // could be implemented here // #ifndef CLBLAS_USE_STD_ATOMIC // // FIXME: G++ does not properly declare __cplusplus according to the standard // but may provide at least in recent versions // #if __cplusplus >= 201103L #define CLBLAS_USE_STD_ATOMIC 1 #else #define CLBLAS_USE_STD_ATOMIC 0 #endif #endif #if CLBLAS_USE_STD_ATOMIC // This is the optimized version using std::atomic from C++11 // // On the long term that shall be the only version // #include class clblasAtomicCounter { private: std::atomic value; public: clblasAtomicCounter(int v) : value(v) { } // Increments the counter and returns the incremented value. // (so a pre-increment) int increment() { return ++ value; } // Decrements the counter and returns the decremented value. 
// (so a pre-decremment) int decrement() { return -- value; } // Provide the counter value int get(){ return value.load(); } }; #else // // A less optimized but more portable version using // a mutex to insure atomicity // class clblasAtomicCounter { private: int value; mutex_t * mutex ; public: clblasAtomicCounter(int v) : value(v) { mutex = mutexInit() ; } ~clblasAtomicCounter() { mutexDestroy(mutex) ; } int increment() { int v ; mutexLock( this->mutex ) ; v = ++ this->value ; mutexUnlock( this->mutex ) ; return v ; } int decrement() { int v ; mutexLock( this->mutex ) ; v = -- this->value; mutexUnlock( this->mutex ) ; return v ; } int get(){ int v ; mutexLock( this->mutex ) ; v = this->value ; mutexUnlock( this->mutex ) ; return v ; } }; #endif #endif clblas-2.10/src/library/blas/functor/include/bonaire.h000066400000000000000000000026771264277366700230030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTION_SELECTOR_BONAIRE_ #define _CLBLAS_FUNCTION_SELECTOR_BONAIRE_ #include //#include class FunctorSelectorBonaire : public clblasFunctorSelector { private: FunctorSelectorBonaire(); static FunctorSelectorBonaire instance; public: // we don't want to provide any DP algorithm as DP is slow on bonaire //virtual clblasDgemmFunctor * select_dgemm_specific(clblasDgemmFunctor::Args & args); virtual clblasSgemmFunctor * select_sgemm_specific(clblasSgemmFunctor::Args & args); // virtual clblasDtrsmFunctor * select_dtrsm_specific(clblasDtrsmFunctor::Args & args); }; #endif // _CLBLAS_FUNCTION_SELECTOR_BONAIRE_ clblas-2.10/src/library/blas/functor/include/functor.h000066400000000000000000000335741264277366700230440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_H_ #define _CLBLAS_FUNCTOR_H_ #include #include #include #include #include "atomic_counter.h" #include "functor_utils.h" // // clblasFunctor is the base class for all functors used to implemetn clBLAS API calls // // A functor is basically an object that provides an implementation of a given function // as a virtual member. 
// // The clblasFunctor class itself does not provide such function but all its // derived classes shall provide one. For consistancy, it is recommanded to // use 'execute' as the name for the member providing that function. // // So the class clblasFunctor shall be derived once for each supported // function. For instance, the class clblasDgemmFunctor provides the base // functor class for implementing the DGEMM call. // // The class clblasDgemmFunctor is itself derived one or more times to provide // the real implementations of the DGEMM functionnality. // // The choice of the proper implementation amongst the available functors is // typically delegated to another mechanism (see clblasFunctorSelector for instance). // class clblasFunctor { public: clblasFunctor(); virtual ~clblasFunctor() ; static cl_int getDeviceAndContext(cl_command_queue q, cl_device_id & device, cl_context & context); static cl_uint getAddressBits(cl_device_id & device); static void getCLVersion(cl_device_id & device, int&major, int& minor); template static void setKernelArg(cl_kernel kernel, int index, T data) { cl_int err = clSetKernelArg(kernel, (unsigned int)index, sizeof(T), (const void *)&data); assert( err == CL_SUCCESS ); } static void setKernelArgPtr(cl_kernel kernel, int index, size_t sz, const void *data) { cl_int err = clSetKernelArg(kernel, (unsigned int)index, sz, data); assert( err == CL_SUCCESS ); } // Indicate that this object has one more user. // The default behavior is to increase the internal use counter. // This function is thread-safe. virtual void retain(); // Indicate that this object has one less user. // The default behavior is to decrease the internal use counter // and, if it reaches zero, to destroy this object. // This function is thread-safe. virtual void release(); private: // Thread-safe reference counter used by the default implementation // of retain and release clblasAtomicCounter refcount; //protected: // cl_program program; // cl_kernel kernel; }; // // This class shall be the base class for all Functor caches. // // The idea is that all Functor caches derived from that class // will register themselves in a global cache pool thus allowing // some global cache management tasks to be performed // // As of now, the only implemented task is to discard all // cache entries (see cleanFunctorCaches() typically called // during clblasTeardown()) but, in future versions, it // would be nice to have other management tasks such as // removing all cache entries using a specific context // or controling the overall size of the cache // class clblasFunctorCacheBase { public: // Constructor: will register the cache in global cache pool clblasFunctorCacheBase(); // Constructor: will unregister the cache in global cache pool virtual ~clblasFunctorCacheBase() ; public: // Discard all members of the cache virtual void discardAll() = 0 ; } ; // // A dummy class used to represent the absence of additional data. // // class clblasNoData { public: // Provide the operator '<' needed by std::less inline bool operator<(const clblasNoData &) const { return false ; } } ; // // Represents the only possible value for clblasNoData // #define CLBLAS_NO_DATA clblasNoData() // // The templated class clblasFunctorCache provides a reusable // implementation of a cache of functors of type F according to the // openCL context and the openCL device. // // The type D is optional and represents additional data used to // index the cache entries. 
The third optional template // argument CompareD is a comparison object for the type D similar // to those used in std::map or std::sort. // // The idea is that each physical implementation of a functor is // supposed to manage its own cache in order to become reusable // between subsequent calls to clBLAS. // // The cache implementation is thread-safe assuming that it is used // properly: // (1) The lookup() method shall be called to search an existing // entry corresponding to the specified context and device. // (2) If the lookup() is successfull then the reference counter // on the returned functor is implicitly increased and that // functor can be used immediately. // (3) If the lookup() is not successfull - that is if the // resulting functor is null, then the cache is locked for // writing and a new cache entry is prepared. The user // is then responsible for creating a new functor that must // be registered via a call to setFunctorInEntry(). // Alternatively, the new cache entry can be dropped by a // call to dropEntry(). // Not calling setFunctorInEntry() or dropEntry() will leave // the cache in a locked state thus making it unusable and // likely to cause a dead-lock. // // In order to simplify development and to avoid errors, the // clblasFunctorCache provides an Lookup object class that hides most // of those details and insure that the case does not stay in an // locked state (as long as the loopup object is properly destroyed). // // In case a non-trivial custom type D is specified it may be necessary // to provide the comparison operator '<' needed by less as in the // following example: // // struct MyData // { // // int x,y,z ; // // inline bool operator< ( const MyData &b ) const // { // const MyData & a = *this ; // if ( a.x < b.x ) return true else if ( a.x > b.x ) return false ; // if ( a.y < b.y ) return true else if ( a.y > b.y ) return false ; // if ( a.z < b.z ) return true else if ( a.z > b.z ) return false ; // return false ; // } // // } // // Alternatively, if the custom type D is known to be a fully // initialized POD (including the unused bits) then the operator // '<' can be provided using memcmp() // // #include // // struct MyData // { // MyData() { std::memset(this,0,sizeof(MyData) ; } // // int x,y,z ; // // inline bool operator< ( const MyData &b ) const // { // return std::memcmp( this, b ) < 0 ; // } // } // // // // template > class clblasFunctorCache : public clblasFunctorCacheBase { private: struct Key { cl_device_id dev; cl_context ctxt; D data ; // Additional user data in the key //compare two Keys bool operator< (const Key & b) const { const Key &a = *this; if(a.dev != b.dev) return a.dev < b.dev; if(a.ctxt != b.ctxt) return a.ctxt < b.ctxt; CompareD cmp ; return cmp(a.data,b.data) ; } }; typedef clblasFunctorCache Cache; // The current implementation is using a std::map internally. // That may not be the most efficient but that can easily be // changed if needed. typedef std::map Map; typedef typename Map::iterator Entry; private: Map m_map; rwlock_t * m_rwlock; public: //Cache constructor: init mutex clblasFunctorCache() { this->m_rwlock = rwlockInit(); } //Cache destructor: destroy mutex ~clblasFunctorCache(){ rwlockDestroy(this->m_rwlock); } public: // // Lookup objects are short time objects used to perform a single query in // the cache. 
// // The usage pattern of a Lookup object shall always be the same // // - Declare a local Lookup object // - Perform a call to the ok() member // (1) if true then use the functor returned by get() // (2) if false then the cache is locked until the user provides // a new functor with a call to set(). // - Destroy the Lookup object // // So a functor implementation can implement its own cache as illustrated // by the following example: // // class MyDGemmFunctor: public clblasDGemmFunctor // { // // ... // // typedef clblasFunctorCache Cache ; // static Cache cache; // // ... // // MyDGemmFunctor * provide(...) // { // MyDGemmFunctor * functor ; // // MyDGemmFunctor::Cache::Lookup lookup(MyDGemmFunctor::cache, ctxt, dev) ; // // if ( lookup.ok() ) // { // return lookup.get() ; // } // else // { // MyDGemmFunctor * functor = new MyDGemmFunctor(...); // lookup.set(functor) ; // return functor ; // } // // } // // ... // // } ; // // class Lookup { private: Entry m_entry ; F * m_functor ; Cache & m_cache ; public: // Constructor // // Perform a lookup in the specified cache // Lookup(Cache & cache, cl_context ctxt, cl_device_id dev , const D & data) : m_cache(cache) { this->m_functor = m_cache.lookup(ctxt,dev,data,this->m_entry) ; } // // Alternative constructor when D is the default type clblasNoData // Lookup(Cache & cache, cl_context ctxt, cl_device_id dev ) : m_cache(cache) { this->m_functor = m_cache.lookup(ctxt,dev,CLBLAS_NO_DATA,this->m_entry) ; } // Destructor ~Lookup() { if ( !this->ok() ) { // Hoops! Something went wrong! // It is important to drop the pending cache entry m_cache.dropPendingEntry(this->m_entry) ; } } bool ok() { return this->m_functor != NULL ; } F * get() { assert(this->ok()) ; //return m_cache.getFunctorFromEntry(this->entry) ; return this->m_functor; } // Set the functor in the void set(F* f) { assert(!this->ok()) ; assert(f != NULL) ; m_cache.fillPendingEntry(this->m_entry,f) ; this->m_functor = f ; this->m_functor->retain(); } } ; private: // Perform a lookup in the cache. // // In case of success, returns the found functor. // // In case of failure, locks the cache, creates a new pending cache entry (in argument 'entry') // and returns NULL. The pending 'entry' shall then be populated with a valid functor by a // call to fillPendingEntry() or shall be dropped by a called dropPendingEntry(). Any failure // to perform one of those action will let the cache in a locked state thus making is unusable // // Remark: Direct use of this member is discouraged. Use the Lookup classe instead. // F* lookup(cl_context ctxt, cl_device_id dev, const D & data, Entry & entry) { Key key = { dev, ctxt , data }; rwlockReadLock(this->m_rwlock); { Entry l_entry = this->m_map.find(key); if( l_entry != this->m_map.end() ) { entry = l_entry; F * f = entry->second; rwlockReadUnlock(this->m_rwlock); return f ; } } rwlockReadUnlock(this->m_rwlock); // key was not found! It must be created std::pair ret; rwlockWriteLock(this->m_rwlock); ret = this->m_map.insert ( std::make_pair(key,(F *) NULL) ); if (!ret.second) { // The key already exists! F * f = ret.first->second ; rwlockWriteUnlock(this->m_rwlock); return f ; } entry = ret.first; // Reminder: the lookup() returns with the cache in a write-locked state return NULL; }; // Fill a pending cache entry with a valid functor as provided by an // unsuccessfull call to lookup(). 
void fillPendingEntry(Entry & entry, F * functor) { assert(functor != NULL) ; entry->second = functor ; rwlockWriteUnlock(this->m_rwlock); } // Drop a pending cache entry with a valid functor as provided by // an unsuccessfull call to lookup(). void dropPendingEntry(Entry & entry ) { this->m_map.erase(entry) ; rwlockWriteUnlock(this->m_rwlock); } public: // Inherited members from clblasFunctorCacheBase void discardAll() { rwlockWriteLock(this->m_rwlock); while ( true ) { Entry entry = this->m_map.begin() ; if ( entry == this->m_map.end() ) break ; entry->second->release() ; this->m_map.erase(entry) ; } rwlockWriteUnlock(this->m_rwlock); } }; #endif // _CLBLAS_FUNCTOR_H_ clblas-2.10/src/library/blas/functor/include/functor_fill.h000066400000000000000000000057441264277366700240500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_FILL_H_ #define _CLBLAS_FUNCTOR_FILL_H_ #include "functor.h" // // The clblasFill2DFunctor provides a method to fill a rectangular part or a // 2D matrix with a single element value. // // The element size shall be 1, 2, 4, 8 or 16. Other values are not (yet) supported // // The functor operates in ColumnMajor mode // class clblasFill2DFunctor : public clblasFunctor { public: struct Args { cl_mem A; size_t offA; // offset in A (in elements) size_t m ; // number of rows size_t n ; // number of columns size_t ldA ; // distance between two columns (in elements) int elemsize ; const void * value ; cl_command_queue queue; cl_uint numEventsInWaitList; const cl_event * eventWaitList; cl_event * events; Args(cl_mem A, size_t offA, size_t m, size_t n, size_t ldA, // distance between two columns (in elements) int elemsize, const void * value, // The fill value (elemsize bytes) cl_command_queue queue, cl_uint numEventsInWaitList, const cl_event * eventWaitList, cl_event * events) : A(A), offA(offA), m(m), n(n), ldA(ldA), elemsize(elemsize), value(value), queue(queue), numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), events(events) { } } ; virtual clblasStatus execute(Args & args) = 0; } ; // // A default portable implementation of clblasFill2DFunctor // class clblasFill2DFunctorDefault : public clblasFill2DFunctor { private: clblasFill2DFunctorDefault(cl_context ctxt, cl_device_id dev, int elemsize, cl_int & err) ; ~clblasFill2DFunctorDefault() ; public: static clblasFill2DFunctorDefault * provide(Args & args) ; public: clblasStatus execute(Args & args) ; private: int m_elemsize ; // the element size. 
Will also be used as key in the cache cl_program m_program ; } ; #endif clblas-2.10/src/library/blas/functor/include/functor_hawaii_dgemm_NT_MN48.h000066400000000000000000000134511264277366700266760ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #define BUILD_KERNEL_FROM_STRING 0 #if BUILD_KERNEL_FROM_STRING #include #else #include #endif class clblasDgemmFunctorHawaii_NT_MN48 : public clblasDgemmFunctor { private: // Constructor & Destructor clblasDgemmFunctorHawaii_NT_MN48(Args & args, cl_int & err); ~clblasDgemmFunctorHawaii_NT_MN48(); public: // Members inherited from clblasDgemmFunctor virtual clblasStatus execute(Args & a); public: static clblasDgemmFunctorHawaii_NT_MN48 * provide(Args & args); private: typedef clblasFunctorCache Cache ; static Cache cache; private: cl_program program ; }; clblasDgemmFunctorHawaii_NT_MN48::Cache clblasDgemmFunctorHawaii_NT_MN48::cache; clblasDgemmFunctorHawaii_NT_MN48 * clblasDgemmFunctorHawaii_NT_MN48::provide(clblasDgemmFunctor::Args & args) { //Work only if TRANSA == N, TRANSB == T, M and N multiple of 48 //Note: Are K%48 == 0 LDA LDB %2 == 0 and OFFA OFFB %2 == 0 required? bool applicable = (args.transA == clblasNoTrans) && (args.transB == clblasTrans) && (args.M % 48 == 0) && (args.N % 48 == 0) && (args.K % 48 == 0) && (args.order == clblasColumnMajor) ; if(!applicable) { return NULL; } cl_device_id dev; cl_context ctxt; cl_int err = getDeviceAndContext(args.queue, dev, ctxt); if (err != CL_SUCCESS) { return NULL; } Cache::Lookup lookup(cache, ctxt, dev, true ) ; if ( lookup.ok() ){ clblasDgemmFunctorHawaii_NT_MN48 * functor = lookup.get(); functor->retain(); // increment the reference counter to avoid deletion while it is still beeing used return functor; } clblasDgemmFunctorHawaii_NT_MN48 * functor = new clblasDgemmFunctorHawaii_NT_MN48(args, err); if (err != CL_SUCCESS) { return NULL; } lookup.set(functor) ; return functor; } clblasDgemmFunctorHawaii_NT_MN48::clblasDgemmFunctorHawaii_NT_MN48(Args & args, cl_int & err) : program(0) { //Hawaii kernel here only for test. //Work only if TRANSA == N, TRANSB == T, M and N multiple of 48 //Note: Are K%48 == 0 LDA LDB %2 == 0 and OFFA OFFB %2 == 0 required? 
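// Worked example of the geometry this functor assumes (the sizes below are
// illustrative, not a requirement beyond the multiples-of-48 rule): execute()
// launches 8x8 work-groups in which every work-item accumulates a 6x6 block of C,
// so one work-group covers a 48x48 tile. For M = N = 4800 that gives
// globalThreads = {4800/6, 4800/6} = {800, 800}, i.e. a 100 x 100 grid of
// work-groups, with no partial tiles because M and N are multiples of 48.
//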
cl_device_id device; cl_context context; cl_command_queue queue = args.queue; err = getDeviceAndContext(queue, device, context); if( err != CL_SUCCESS ) { return; } BinaryLookup bl(context, device, "clblasDgemmFunctorHawaii_NT_MN48"); bl.variantInt(48); if ( !bl.found() ) // may create empty file or may wait until file is ready { #if BUILD_KERNEL_FROM_STRING // directly build from a char* err = bl.buildFromSource(DGEMM_NT_MN48_KERNEL); #else // build from compiled version of the kernel (SPIR) err = bl.buildFromBinary(DGEMM_NT_MN48_SPIR_KERNEL, sizeof(DGEMM_NT_MN48_SPIR_KERNEL)); #endif if( err != CL_SUCCESS ) { return; } } this->program = bl.getProgram(); } clblasDgemmFunctorHawaii_NT_MN48::~clblasDgemmFunctorHawaii_NT_MN48() { if (this->program) { clReleaseProgram( this->program ) ; } } clblasStatus clblasDgemmFunctorHawaii_NT_MN48::execute(Args & args) { cl_int err; cl_command_queue queue = args.queue; cl_kernel kernel = clCreateKernel( this->program, "dgemm", &err); if (err != CL_SUCCESS) return clblasStatus(err) ; int M = args.M, N = args.N, K = args.K; int lda = args.lda, ldb = args.ldb, ldc = args.ldc; int offsetA = args.offA; int offsetB = args.offB; int offsetC = args.offC; setKernelArg(kernel, 0, args.C); setKernelArg(kernel, 1, args.B); setKernelArg(kernel, 2, args.A); setKernelArg(kernel, 3, N); setKernelArg(kernel, 4, M); setKernelArg(kernel, 5, K); setKernelArg(kernel, 6, args.alpha); setKernelArg(kernel, 7, args.beta); setKernelArg(kernel, 8, ldc); setKernelArg(kernel, 9, ldb); setKernelArg(kernel, 10, lda); setKernelArg(kernel, 11, offsetC); setKernelArg(kernel, 12, offsetB); setKernelArg(kernel, 13, offsetA); const size_t ls[2] = {8, 8}; const size_t bwi[2] = {6, 6}; size_t globalThreads[2]; unsigned int thx, thy; thx = M/bwi[0] + ((M%bwi[0] != 0) ? 1 : 0); // Each PE updates (bwi[0] x bwi[1])=(6 x 6) values thx = thx/ls[0] + ((thx%ls[0] != 0) ? 1 : 0); // Each work group is made of (ls[0] x ls[1])=(8 x 8) PE thx = ls[0] * thx; thy = N/bwi[1] + ((N%bwi[1] != 0) ? 1 : 0); // Each PE updates (bwi[0] x bwi[1])=(6 x 6) values thy = thy/ls[1] + ((thy%ls[1] != 0) ? 1 : 0); // Each work group is made of (ls[0] x ls[1])=(8 x 8) PE thy = ls[1] * thy; globalThreads[0] = thx; globalThreads[1] = thy; err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, NULL , args.numEventsInWaitList, args.eventWaitList, args.events); clReleaseKernel(kernel) ; return clblasStatus(err) ; } clblas-2.10/src/library/blas/functor/include/functor_selector.h000066400000000000000000000142631264277366700247360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_SELECTOR_H_ #define _CLBLAS_FUNCTOR_SELECTOR_H_ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #include #include #include #include #include #include #include // // The purpose of clblasFunctorSelector is to provide some selection functions to // choose amongst all functors available for a given task. // // Each BLAS function xxxx is typically associated to at least two virtual methods: // // - select_xxxx_generic() to select a generic functor applicable to all // possible arguments. // // - select_xxxx_specific(...) to select the most optimized functor for the // specified arguments. // // A user willing to obtain a functor using one of those functions shall typically // query the most appropriate selector using one of the static find() functions. // // There is only one instance of the default clblasFunctorSelector that is used for // devices without a specialized version. // // So the clblasFunctorSelector is supposed to be derived once for each supported // device architecture (e.g. FunctorSelectorTahiti for the AMD Tahiti GPU). Each // of those derived class shall define a single global instance of itself that // will register itself in a global table of all known functor selectors. // // The specialized selector class shall then provide its own select virtual // methods for which it exists a specialized implementation. Those specialized // selection methods may fall back on the default method if they do not provide // an optimized functor in all cases // class clblasFunctorSelector { protected: // Constructor for the non-default instances specialized for a given device. clblasFunctorSelector(DeviceChip chip); private: // This constructor is only for the default_instance clblasFunctorSelector(); // The selector default use when no specialized version exists // for the current device static clblasFunctorSelector default_instance ; public: // Find the suitable functor selector for the specified queue static clblasFunctorSelector * find(cl_command_queue queue); // Find the suitable functor selector for the specified device static clblasFunctorSelector * find(cl_device_id device); // Find the suitable functor selector for the specified device architecture static clblasFunctorSelector * find(DeviceChip arch) ; // Find if the device is a FirePro one. 
If not we will return the default functor which won't use the fast kernel for GCN static int FindFirePro(cl_device_id device); public: // Provide a XGEMM Functor usable in all cases virtual clblasSgemmFunctor * select_sgemm_generic(); virtual clblasDgemmFunctor * select_dgemm_generic(); virtual clblasCgemmFunctor * select_cgemm_generic(); virtual clblasZgemmFunctor * select_zgemm_generic(); // Provide XGEMM functors optimized for specific arguments virtual clblasSgemmFunctor * select_sgemm_specific(clblasSgemmFunctor::Args & args); virtual clblasDgemmFunctor * select_dgemm_specific(clblasDgemmFunctor::Args & args); virtual clblasCgemmFunctor * select_cgemm_specific(clblasCgemmFunctor::Args & args); virtual clblasZgemmFunctor * select_zgemm_specific(clblasZgemmFunctor::Args & args); // Provide a XSCAL Functor usable in all cases virtual clblasSscalFunctor * select_sscal_generic(clblasSscalFunctor::Args & args); virtual clblasDscalFunctor * select_dscal_generic(clblasDscalFunctor::Args & args); virtual clblasCscalFunctor * select_cscal_generic(clblasCscalFunctor::Args & args); virtual clblasZscalFunctor * select_zscal_generic(clblasZscalFunctor::Args & args); virtual clblasCsscalFunctor * select_csscal_generic(clblasCsscalFunctor::Args & args); virtual clblasZdscalFunctor * select_zdscal_generic(clblasZdscalFunctor::Args & args); // Provide XSCAL functors optimized for specific arguments virtual clblasSscalFunctor * select_sscal_specific(clblasSscalFunctor::Args & args); virtual clblasDscalFunctor * select_dscal_specific(clblasDscalFunctor::Args & args); virtual clblasCscalFunctor * select_cscal_specific(clblasCscalFunctor::Args & args); virtual clblasZscalFunctor * select_zscal_specific(clblasZscalFunctor::Args & args); virtual clblasCsscalFunctor * select_csscal_specific(clblasCsscalFunctor::Args & args); virtual clblasZdscalFunctor * select_zdscal_specific(clblasZdscalFunctor::Args & args); // Provide a XGEMM Functor usable in all cases virtual clblasStrsmFunctor * select_strsm_generic(); virtual clblasDtrsmFunctor * select_dtrsm_generic(); virtual clblasCtrsmFunctor * select_ctrsm_generic(); virtual clblasZtrsmFunctor * select_ztrsm_generic(); // Provide XTRSM functors optimized for specific arguments virtual clblasStrsmFunctor * select_strsm_specific(clblasStrsmFunctor::Args & args); virtual clblasDtrsmFunctor * select_dtrsm_specific(clblasDtrsmFunctor::Args & args); virtual clblasCtrsmFunctor * select_ctrsm_specific(clblasCtrsmFunctor::Args & args); virtual clblasZtrsmFunctor * select_ztrsm_specific(clblasZtrsmFunctor::Args & args); // Provide functor to perform non-contiguous fill in a 2D matrix virtual clblasFill2DFunctor * select_fill2d_specific(clblasFill2DFunctor::Args & args); }; #endif // _CLBLAS_FUNCTOR_SELECTOR_H_ clblas-2.10/src/library/blas/functor/include/functor_utils.h000066400000000000000000000104211264277366700242460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_UTILS_ #define _CLBLAS_FUNCTOR_UTILS_ #include static inline clblasStatus checkQueues(cl_uint numCommandQueues, cl_command_queue *commandQueues) { if (numCommandQueues<=0) { return clblasInvalidCommandQueue; } if (commandQueues == 0) { return clblasInvalidCommandQueue; } for (cl_uint i=0 ; i0 && eventWaitList == 0) { return clblasInvalidEventWaitList; } for (cl_uint i=0 ; i class clblasXgemmFunctor : public clblasFunctor { public: // Structure used to store all XGEMM arguments struct Args { clblasOrder order; clblasTranspose transA; clblasTranspose transB; size_t M; size_t N; size_t K; T alpha; cl_mem A; size_t offA; size_t lda; cl_mem B; size_t offB; size_t ldb; T beta; cl_mem C; size_t offC; size_t ldc; cl_command_queue queue; cl_uint numEventsInWaitList; const cl_event * eventWaitList; cl_event * events; Args(clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, T alpha, cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, T beta, cl_mem C, size_t offC, size_t ldc, cl_command_queue queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) : order(order), transA(transA), transB(transB), M(M), N(N), K(K), alpha(alpha), A(A), offA(offA), lda(lda), B(B), offB(offB), ldb(ldb), beta(beta), C(C), offC(offC), ldc(ldc), queue(queue), numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), events(events) { } }; public: virtual clblasStatus execute(Args &args) = 0; } ; // ================ SGEMM ================== // // Base class for all functors providing a SGEMM implementation // class clblasSgemmFunctor : public clblasXgemmFunctor { }; // // Fallback functor for SGEMM using the original solver mechanism // class clblasSgemmFunctorFallback : public clblasSgemmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasSgemmFunctor virtual clblasStatus execute(Args & a); public: static clblasSgemmFunctorFallback * provide (); }; // ================ DGEMM ================== // // // Base class for all functors providing a DGEMM implementation // class clblasDgemmFunctor : public clblasXgemmFunctor { }; // // Fallback functor for DGEMM using the original solver mechanism // class clblasDgemmFunctorFallback : public clblasDgemmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasDgemmFunctor virtual clblasStatus execute(Args & a); public: static clblasDgemmFunctorFallback * provide (); }; // ================ CGEMM ================== // // Base class for all functors providing a CGEMM implementation // class clblasCgemmFunctor : public clblasXgemmFunctor { }; // // Fallback functor for CGEMM using the original solver mechanism // class clblasCgemmFunctorFallback : public clblasCgemmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasCgemmFunctor virtual clblasStatus execute(Args & a); public: static clblasCgemmFunctorFallback * provide (); }; // ================ ZGEMM ================== // // Base class for all functors providing a ZGEMM implementation // class clblasZgemmFunctor : public clblasXgemmFunctor { }; // // Fallback functor 
for ZGEMM using the original solver mechanism // class clblasZgemmFunctorFallback : public clblasZgemmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasZgemmFunctor virtual clblasStatus execute(Args & a); public: static clblasZgemmFunctorFallback * provide (); }; #endif // _CLBLAS_FUNCTOR_XGEMM_H_ clblas-2.10/src/library/blas/functor/include/functor_xscal.h000066400000000000000000000126331264277366700242270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_XSCAL_H_ #define _CLBLAS_FUNCTOR_XSCAL_H_ // // This file provides the declarations of all XSCAL functors and related classes. // // // #include // // Base class for all XSCAL functors (DSCAL, SSCAL, ...) // template class clblasXscalFunctor : public clblasFunctor { public: // Structure used to store all XSCAL arguments struct Args { size_t N; Talpha alpha; cl_mem X; size_t offx; int incx; cl_command_queue queue; cl_uint numEventsInWaitList; const cl_event * eventWaitList; cl_event * events; Args(size_t N, Talpha alpha, cl_mem X, size_t offx, int incx, cl_command_queue queue, cl_uint numEventsInWaitList, const cl_event * eventWaitList, cl_event * events) : N(N), alpha(alpha), X(X), offx(offx), incx(incx), queue(queue), numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), events(events) { } }; virtual clblasStatus execute(Args & args) = 0; }; // // Base class for all functors providing a SSCAL implementation // class clblasSscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a DSCAL implementation // class clblasDscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a CSCAL implementation // class clblasCscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a ZSCAL implementation // class clblasZscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a CSSCAL implementation // class clblasCsscalFunctor: public clblasXscalFunctor { }; // // Base class for all functors providing a ZDSCAL implementation // class clblasZdscalFunctor: public clblasXscalFunctor { }; // // Fallback functor for SSCAL : implement the sscal using the old solver mechanism // class clblasSscalFunctorFallback : public clblasSscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasSscalFunctor virtual clblasStatus execute(Args & a); public: static clblasSscalFunctorFallback * provide (); }; // // Fallback functor for DSCAL : implement the dscal using the old solver mechanism // class clblasDscalFunctorFallback : public clblasDscalFunctor { public: // Inherited members from clblasFunctor virtual void 
retain(); virtual void release(); public: // Inherited members from clblasDscalFunctor virtual clblasStatus execute(Args & a); public: static clblasDscalFunctorFallback * provide (); }; // // Fallback functor for CSCAL : implement the Cscal using the old solver mechanism // class clblasCscalFunctorFallback : public clblasCscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasCscalFunctor virtual clblasStatus execute(Args & a); public: static clblasCscalFunctorFallback * provide (); }; // // Fallback functor for ZSCAL : implement the zscal using the old solver mechanism // class clblasZscalFunctorFallback : public clblasZscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasZscalFunctor virtual clblasStatus execute(Args & a); public: static clblasZscalFunctorFallback * provide (); }; // // Fallback functor for CSSCAL : implement the Csscal using the old solver mechanism // class clblasCsscalFunctorFallback : public clblasCsscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasCsscalFunctor virtual clblasStatus execute(Args & a); public: static clblasCsscalFunctorFallback * provide (); }; // // Fallback functor for ZDSCAL : implement the zdscal using the old solver mechanism // class clblasZdscalFunctorFallback : public clblasZdscalFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasZdscalFunctor virtual clblasStatus execute(Args & a); public: static clblasZdscalFunctorFallback * provide (); }; #endif // _CLBLAS_FUNCTOR_XSCAL_H_ clblas-2.10/src/library/blas/functor/include/functor_xscal_generic.h000066400000000000000000000131501264277366700257160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_XSCAL_GENERIC_H_ #define _CLBLAS_FUNCTOR_XSCAL_GENERIC_H_ #include // A POD type used to index the functors below struct _clblasXscalFunctorGenericData { int vecLen ; bool doVLOAD ; bool noUnity ; // operator< is needed for the cache bool operator<(const _clblasXscalFunctorGenericData &b) const { const _clblasXscalFunctorGenericData &a = *this ; if ( a.vecLen != b.vecLen ) return a.vecLen < b.vecLen ; if ( a.doVLOAD != b.doVLOAD ) return a.doVLOAD < b.doVLOAD ; if ( a.noUnity != b.noUnity ) return a.noUnity < b.noUnity ; return false ; } } ; // // Generic functor for SSCAL : implement the sscal using kprintf generator // class clblasSscalFunctorGeneric : public clblasSscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasSscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasSscalFunctorGeneric(); public: // Inherited members from clblasSscalFunctor virtual clblasStatus execute(Args & a); public: static clblasSscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; // // Generic functor for DSCAL : implement the dscal using the kprintf generator // class clblasDscalFunctorGeneric : public clblasDscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasDscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasDscalFunctorGeneric(); public: // Inherited members from clblasDscalFunctor virtual clblasStatus execute(Args & a); public: static clblasDscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; // // Generic functor for CSCAL : implement the Cscal using the kprintf generator // class clblasCscalFunctorGeneric : public clblasCscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasCscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasCscalFunctorGeneric(); public: // Inherited members from clblasCscalFunctor virtual clblasStatus execute(Args & a); public: static clblasCscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; // // Generic functor for ZSCAL : implement the zscal using the kprintf generator // class clblasZscalFunctorGeneric : public clblasZscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasZscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasZscalFunctorGeneric(); public: // Inherited members from clblasZscalFunctor virtual clblasStatus execute(Args & a); public: static clblasZscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; // // Generic functor for CSSCAL : implement the Csscal using the kprintf generator // class clblasCsscalFunctorGeneric : public clblasCsscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasCsscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasCsscalFunctorGeneric(); public: // Inherited members from 
clblasCsscalFunctor virtual clblasStatus execute(Args & a); public: static clblasCsscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; // // Generic functor for ZDSCAL : implement the zdscal using the kprintf generator // class clblasZdscalFunctorGeneric : public clblasZdscalFunctor { public: typedef _clblasXscalFunctorGenericData Data ; Data data; public: // Constructor & Destructor clblasZdscalFunctorGeneric(cl_context ctxt, cl_device_id dev, const Data & data, cl_int & err); ~clblasZdscalFunctorGeneric(); public: // Inherited members from clblasZdscalFunctor virtual clblasStatus execute(Args & a); public: static clblasZdscalFunctorGeneric * provide (Args & a); public: typedef clblasFunctorCache Cache; static Cache cache; public: cl_program program; }; #endif // _CLBLAS_FUNCTOR_XSCAL_GENERIC_H_ clblas-2.10/src/library/blas/functor/include/functor_xtrsm.h000066400000000000000000000117171264277366700242740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTOR_XTRSM_H_ #define _CLBLAS_FUNCTOR_XTRSM_H_ // // This file provides the declarations of all XTRSM functors and related classes. // // // #include "functor.h" // // Base class for all XTRSM functors (DTRSM, STRSM, ...) 
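/*
 * Side note on the per-functor caches used by the generic *scal functors
 * above (illustrative sketch, not part of the original header): the
 * _clblasXscalFunctorGenericData POD defines operator< precisely so that it
 * can act as an ordered key.  A minimal stand-in using std::map (the real
 * clblasFunctorCache may be implemented differently):
 *
 *   #include <map>
 *
 *   std::map<_clblasXscalFunctorGenericData, cl_program> cache;
 *
 *   _clblasXscalFunctorGenericData key = { 4, true, false };  // hypothetical variant
 *   std::map<_clblasXscalFunctorGenericData, cl_program>::iterator it = cache.find(key);
 *   if (it == cache.end()) {
 *       // First request for this (vecLen, doVLOAD, noUnity) combination:
 *       // build the cl_program once, insert it into the map, and let every
 *       // later functor with the same key reuse the compiled program.
 *   }
 */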
// template class clblasXtrsmFunctor : public clblasFunctor { public: // Structure used to store all XTRSM arguments struct Args { clblasOrder order; clblasSide side; clblasUplo uplo; clblasTranspose transA; clblasDiag diag; size_t M; size_t N; T alpha; cl_mem A; size_t offA; size_t lda; cl_mem B; size_t offB; size_t ldb; cl_command_queue queue; cl_uint numEventsInWaitList; const cl_event * eventWaitList; cl_event * events; Args(clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T alpha, cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_command_queue queue, cl_uint numEventsInWaitList, const cl_event * eventWaitList, cl_event * events) : order(order), side(side), uplo(uplo), transA(transA), diag(diag), M(M), N(N), alpha(alpha), A(A), offA(offA), lda(lda), B(B), offB(offB), ldb(ldb), queue(queue), numEventsInWaitList(numEventsInWaitList), eventWaitList(eventWaitList), events(events) { } }; public: virtual clblasStatus execute(Args &args) = 0; } ; // ================ STRSM ================== // // Base class for all functors providing a STRSM implementation // class clblasStrsmFunctor : public clblasXtrsmFunctor { }; // // Fallback functor for STRSM using the original solver mechanism // class clblasStrsmFunctorFallback : public clblasStrsmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasStrsmFunctor virtual clblasStatus execute(Args & a); public: static clblasStrsmFunctorFallback * provide (); }; // ================ DTRSM ================== // // Base class for all functors providing a DTRSM implementation // class clblasDtrsmFunctor : public clblasXtrsmFunctor { }; // // Fallback functor for DTRSM using the original solver mechanism // class clblasDtrsmFunctorFallback : public clblasDtrsmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasDtrsmFunctor virtual clblasStatus execute(Args & a); public: static clblasDtrsmFunctorFallback * provide (); }; // ================ CTRSM ================== // // Base class for all functors providing a CTRSM implementation // class clblasCtrsmFunctor : public clblasXtrsmFunctor { }; // // Fallback functor for CTRSM using the original solver mechanism // class clblasCtrsmFunctorFallback : public clblasCtrsmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasCtrsmFunctor virtual clblasStatus execute(Args & a); public: static clblasCtrsmFunctorFallback * provide (); }; // ================ ZTRSM ================== // // Base class for all functors providing a ZTRSM implementation // class clblasZtrsmFunctor : public clblasXtrsmFunctor { }; // // Fallback functor for ZTRSM using the original solver mechanism // class clblasZtrsmFunctorFallback : public clblasZtrsmFunctor { public: // Inherited members from clblasFunctor virtual void retain(); virtual void release(); public: // Inherited members from clblasZtrsmFunctor virtual clblasStatus execute(Args & a); public: static clblasZtrsmFunctorFallback * provide (); }; #endif // _CLBLAS_FUNCTOR_XTRSM_H_ clblas-2.10/src/library/blas/functor/include/gcn_dgemm.h000066400000000000000000000037321264277366700232750ustar00rootroot00000000000000#ifndef CLBLASDGEMMFUNCTORGCN #define CLBLASDGEMMFUNCTORGCN #include class clblasDgemmFunctorGCN : public 
clblasDgemmFunctor { public: // // A structure that describes a kernel variant. // // It is important that all instances of those structures shall // be const and static because their addresses are used as keys // in the internal functor cache. // // Also, they shall all have a unique kernel name. // struct Variant { const char * kernel_name ; const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divN ; // Required divisor of N (use 1 when N can be of any value) unsigned divM ; // Required divisor of M (use 1 when M can be of any value) unsigned divK ; // Required divisor of K (use 1 when K can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor //clblasDgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) ; public: // Provide a suitable clblasDgemmFunctorGCN for the specified args // or NULL if none //static clblasDgemmFunctorGCN * provide(clblasDgemmFunctor::Args & args, const char* DevName) ; public: // inherited member from clblasDgemmFunctor virtual clblasStatus execute(Args &args) ; protected: cl_program m_program ; const Variant * m_variant ; // Pointer to a 'const static' object describing the kernel variant. } ; #endif clblas-2.10/src/library/blas/functor/include/gcn_dgemmCommon.h000066400000000000000000000007541264277366700244470ustar00rootroot00000000000000#ifndef DGEMMMGCNCOMMON #define DGEMMMGCNCOMMON #include "gcn_dgemm.h" class clBlasGCNdgemmCommonFunctor : public clblasDgemmFunctorGCN { private: // Constructor & Destructor clBlasGCNdgemmCommonFunctor(Args & args, const Variant * variant, cl_int & err) ; public: // Provide a suitable hawaii_dgemmChannelConflict for the specified args // or NULL if none static clBlasGCNdgemmCommonFunctor * provide(clblasDgemmFunctor::Args & args, const char* DevName) ; }; #endifclblas-2.10/src/library/blas/functor/include/gcn_dgemmSmallMatrices.h000066400000000000000000000012331264277366700257500ustar00rootroot00000000000000#ifndef GCN_DGEMMMSMALLMATRICES #define GCN_DGEMMMSMALLMATRICES #include "gcn_dgemm.h" class clBlasGCNDgemmSmallMatricesFunctor : public clblasDgemmFunctorGCN { public: private: // Constructor & Destructor clBlasGCNDgemmSmallMatricesFunctor(Args & args, const Variant * variant, cl_int & err) ; //cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args); public: // Provide a suitable hawaii_dgemmChannelConflict for the specified args // or NULL if none static clBlasGCNDgemmSmallMatricesFunctor * provide(clblasDgemmFunctor::Args & args, const char* DevName) ; virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/gcn_sgemm.h000066400000000000000000000043241264277366700233120ustar00rootroot00000000000000#ifndef CLBLASSGEMMFUNCTORGCN #define CLBLASSGEMMFUNCTORGCN #include class clblasSgemmFunctorGCN : public clblasSgemmFunctor { public: // // A structure that describes a kernel variant. // // It is important that all instances of those structures shall // be const and static because their addresses are used as keys // in the internal functor cache. // // Also, they shall all have a unique kernel name. 
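/*
 * Illustrative sketch (values are hypothetical, not from the original
 * source): because the internal functor cache keys on the *address* of the
 * Variant descriptor declared just below, providers typically keep each
 * variant as a 'static const' object with a unique kernel name, e.g.:
 *
 *   static const clblasSgemmFunctorGCN::Variant sgemmNT_32x32 = {
 *       "sgemm_NT_32x32",      // kernel_name   (must be unique)
 *       sgemmNT_source,        // source        (hypothetical kernel string)
 *       "-cl-mad-enable",      // build_options (hypothetical)
 *       NULL, 0,               // bin, bin_size (no pre-built binary)
 *       clblasNoTrans,         // transA
 *       clblasTrans,           // transB
 *       32, 32, 8,             // divN, divM, divK
 *       { 16, 16 },            // ls  : work-group size
 *       { 2, 2 },              // bwi : elements computed per work-item
 *       ""                     // mult
 *   };
 *
 * A provide() implementation can then pass &sgemmNT_32x32 to every functor it
 * creates, so two functors built from the same variant share one cache entry.
 */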
// struct Variant { const char * kernel_name ; const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divN ; // Required divisor of N (use 1 when N can be of any value) unsigned divM ; // Required divisor of M (use 1 when M can be of any value) unsigned divK ; // Required divisor of K (use 1 when K can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clblasSgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) ; public: // Provide a suitable clblasDgemmFunctorGCN for the specified args // or NULL if none static clblasSgemmFunctorGCN * provide(clblasSgemmFunctor::Args & args, const char* DevName) ; public: // inherited member from clblasDgemmFunctor virtual clblasStatus execute(Args &args) ; protected: //we need a default constructor as we derive this class, //but we can't use the specific constructor as the arguments won't be the same (variant!!!). //Maybe it worth revisiting this class to have something cleaner clblasSgemmFunctorGCN(){}; cl_program m_program ; protected: const Variant * m_variant ; // Pointer to a 'const static' object describing the kernel variant. } ; #endif clblas-2.10/src/library/blas/functor/include/gcn_sgemmSmallMatrices.h000066400000000000000000000012331264277366700257670ustar00rootroot00000000000000#ifndef GCN_SGEMMMSMALLMATRICES #define GCN_SGEMMMSMALLMATRICES #include "gcn_sgemm.h" class clBlasGCNSgemmSmallMatricesFunctor : public clblasSgemmFunctorGCN { public: private: // Constructor & Destructor clBlasGCNSgemmSmallMatricesFunctor(Args & args, const Variant * variant, cl_int & err) ; //cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args); public: // Provide a suitable hawaii_dgemmChannelConflict for the specified args // or NULL if none static clBlasGCNSgemmSmallMatricesFunctor * provide(clblasSgemmFunctor::Args & args, const char* DevName) ; virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/gcn_zgemm.h000066400000000000000000000043241264277366700233210ustar00rootroot00000000000000#ifndef CLBLASZGEMMFUNCTORGCN #define CLBLASZGEMMFUNCTORGCN #include class clblasZgemmFunctorGCN : public clblasZgemmFunctor { public: // // A structure that describes a kernel variant. // // It is important that all instances of those structures shall // be const and static because their addresses are used as keys // in the internal functor cache. // // Also, they shall all have a unique kernel name. // struct Variant { const char * kernel_name ; const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divN ; // Required divisor of N (use 1 when N can be of any value) unsigned divM ; // Required divisor of M (use 1 when M can be of any value) unsigned divK ; // Required divisor of K (use 1 when K can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. 
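/*
 * Worked example with illustrative numbers (not from the original source):
 * with ls = {16, 16} and bwi = {2, 2}, a work-group of 16x16 work-items
 * computes a (16*2) x (16*2) = 32 x 32 block of C, so such a variant would
 * typically declare divM = divN = 32 and let the selector fall back to a
 * generic functor for sizes that are not multiples of 32.
 */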
std::string mult; } ; private: // Constructor & Destructor clblasZgemmFunctorGCN(Args & args, const Variant * variant, cl_int & err) ; public: // Provide a suitable clblasZgemmFunctorGCN for the specified args // or NULL if none static clblasZgemmFunctorGCN * provide(clblasZgemmFunctor::Args & args, const char* DevName) ; public: // inherited member from clblasZgemmFunctor virtual clblasStatus execute(Args &args) ; protected: //we need a default constructor as we derive this class, //but we can't use the specific constructor as the arguments won't be the same (variant!!!). //Maybe it worth revisiting this class to have something cleaner clblasZgemmFunctorGCN(){}; cl_program m_program ; protected: const Variant * m_variant ; // Pointer to a 'const static' object describing the kernel variant. } ; #endif clblas-2.10/src/library/blas/functor/include/gpu_dtrsm.h000066400000000000000000000012021264277366700233470ustar00rootroot00000000000000#ifndef _CLBLAS_DTRSM_FUNCTOR_GPU_H_ #define _CLBLAS_DTRSM_FUNCTOR_GPU_H_ class clblasDtrsmFunctorGpu : public clblasDtrsmFunctor { public: private: // Constructor & Destructor clblasDtrsmFunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) ; public: // Provide a suitable clblasDtrsmFunctorTahiti for the specified args // or NULL if none static clblasDtrsmFunctorGpu * provide(clblasDtrsmFunctor::Args & args, const char* DevName) ; public: // inherited member from clblasDtrsmFunctor virtual clblasStatus execute(Args &args) ; private: cl_program m_program ; } ; #endif clblas-2.10/src/library/blas/functor/include/gpu_dtrsm192.h000066400000000000000000000012211264277366700236040ustar00rootroot00000000000000#ifndef _CLBLAS_DTRSM192_FUNCTOR_GPU_H_ #define _CLBLAS_DTRSM192_FUNCTOR_GPU_H_ class clblasDtrsm192FunctorGpu : public clblasDtrsmFunctor { public: private: // Constructor & Destructor clblasDtrsm192FunctorGpu(Args & args, cl_int & err, const char* DevName, cl_uint _64BitsUse) ; public: // Provide a suitable clblasDtrsmFunctorTahiti for the specified args // or NULL if none static clblasDtrsm192FunctorGpu * provide(clblasDtrsmFunctor::Args & args, const char* DevName) ; public: // inherited member from clblasDtrsmFunctor virtual clblasStatus execute(Args &args) ; private: cl_program m_program ; } ; #endif clblas-2.10/src/library/blas/functor/include/hawaii.h000066400000000000000000000030321264277366700226100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef _CLBLAS_FUNCTION_SELECTOR_HAWAII_ #define _CLBLAS_FUNCTION_SELECTOR_HAWAII_ #include //#include class FunctorSelectorHawaii : public clblasFunctorSelector { private: FunctorSelectorHawaii(); static FunctorSelectorHawaii instance; public: // Provide a dgemmFunctor usable only if N is a multiple of blocksize // and incx==1 virtual clblasDgemmFunctor * select_dgemm_specific(clblasDgemmFunctor::Args & args); virtual clblasSgemmFunctor * select_sgemm_specific(clblasSgemmFunctor::Args & args); virtual clblasZgemmFunctor * select_zgemm_specific(clblasZgemmFunctor::Args & args); virtual clblasDtrsmFunctor * select_dtrsm_specific(clblasDtrsmFunctor::Args & args); }; #endif // _CLBLAS_FUNCTION_SELECTOR_HAWAII_ clblas-2.10/src/library/blas/functor/include/hawaii_dgemmChannelConflict.h000066400000000000000000000010251264277366700267340ustar00rootroot00000000000000#ifndef HAWAII_DGEMMMCHANNELCONFLICT #define HAWAII_DGEMMMCHANNELCONFLICT #include "gcn_dgemm.h" class clBlashawaiiDgemmChannelConflictFunctor : public clblasDgemmFunctorGCN { private: // Constructor & Destructor clBlashawaiiDgemmChannelConflictFunctor(Args & args, const Variant * variant, cl_int & err) ; public: // Provide a suitable hawaii_dgemmChannelConflict for the specified args // or NULL if none static clBlashawaiiDgemmChannelConflictFunctor * provide(clblasDgemmFunctor::Args & args) ; }; #endifclblas-2.10/src/library/blas/functor/include/hawaii_dgemmSplitKernel.h000066400000000000000000000031261264277366700261420ustar00rootroot00000000000000#ifndef HAWAII_DGEMMMSPLITKERNEL #define HAWAII_DGEMMMSPLITKERNEL #include "gcn_dgemm.h" class clBlashawaiiDgemmSplitKernelFunctor : public clblasDgemmFunctorGCN { public: struct Variant { const char * variantName; const char * kernel_name[4] ; //order is main, row, column, single const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divK ; // Required divisor of N (use 1 when N can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clBlashawaiiDgemmSplitKernelFunctor(Args & args, const Variant * variant, cl_int & err) ; cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args); const Variant * m_variantSplit ; // Pointer to a 'const static' object describing the kernel variant. 
public: // Provide a suitable hawaii_dgemmChannelConflict for the specified args // or NULL if none static clBlashawaiiDgemmSplitKernelFunctor * provide(clblasDgemmFunctor::Args & args) ; virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/hawaii_sgemmBig1024Kernel.h000066400000000000000000000031671264277366700261030ustar00rootroot00000000000000/* Handles lda=ldb=4096, 5120, 7168, 8192 lda=ldb=6144 should be handled by a special case in hawaii_sgemmSplitKernel */ #ifndef HAWAII_SGEMMBIG1024KERNEL #define HAWAII_SGEMMBIG1024KERNEL #include "gcn_sgemm.h" class clBlashawaiiSgemmBig1024KernelFunctor : public clblasSgemmFunctorGCN { public: struct Variant { const char * variantName; const char * kernel_name[1] ; //just one kernel here const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divK ; // Required divisor of N (use 1 when N can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clBlashawaiiSgemmBig1024KernelFunctor(Args & args, const Variant * variant, cl_int & err); cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args); const Variant * m_variantBig1024 ; // Pointer to a 'const static' object describing the kernel variant. public: static clBlashawaiiSgemmBig1024KernelFunctor * provide(clblasSgemmFunctor::Args & args, char* DevName); virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/hawaii_sgemmBranchKernel.h000066400000000000000000000033311264277366700262610ustar00rootroot00000000000000/* Handles non multiples of 16, 32, 48, 64, 94 SGEMM in one kernel Only non multiples of 32 (NT) is implemented right now. */ #ifndef HAWAII_SGEMMBRANCHKERNEL #define HAWAII_SGEMMBRANCHKERNEL #include "gcn_sgemm.h" class clBlashawaiiSgemmBranchKernelFunctor : public clblasSgemmFunctorGCN { public: struct Variant { const char * variantName; const char * kernel_name[1] ; //just one kernel here const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divK ; // Required divisor of N (use 1 when N can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clBlashawaiiSgemmBranchKernelFunctor(Args & args, const Variant * variant, cl_int & err); cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[1], Args &args); const Variant * m_variantBranch ; // Pointer to a 'const static' object describing the kernel variant. 
public: // Provide a suitable hawaii_sgemmChannelConflict for the specified args // or NULL if none static clBlashawaiiSgemmBranchKernelFunctor * provide(clblasSgemmFunctor::Args & args, char* DevName) ; virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/hawaii_sgemmSplit64_32.h000066400000000000000000000031361264277366700254770ustar00rootroot00000000000000#ifndef HAWAII_SGEMMMSPLIT64_32 #define HAWAII_SGEMMMSPLIT64_32 #include "gcn_sgemm.h" class clBlashawaiiSgemmSplit64_32Functor : public clblasSgemmFunctorGCN { public: struct Variant { const char * variantName; const char * kernel_name[4] ; //order is main, row, column, single const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divK ; // Required divisor of N (use 1 when N can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clBlashawaiiSgemmSplit64_32Functor(Args & args, const Variant * variant, cl_int & err); cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args); const Variant * m_variantSplit ; // Pointer to a 'const static' object describing the kernel variant. public: // Provide a suitable hawaii_sgemmChannelConflict for the specified args // or NULL if none static clBlashawaiiSgemmSplit64_32Functor * provide(clblasSgemmFunctor::Args & args, char* DevName); virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/hawaii_sgemmSplitKernel.h000066400000000000000000000031461264277366700261630ustar00rootroot00000000000000#ifndef HAWAII_SGEMMMSPLITKERNEL #define HAWAII_SGEMMMSPLITKERNEL #include "gcn_sgemm.h" class clBlashawaiiSgemmSplitKernelFunctor : public clblasSgemmFunctorGCN { public: struct Variant { const char * variantName; const char * kernel_name[4] ; //order is main, row, column, single const char * source ; // the kernel source (shall be unique) const char * build_options; const char * bin ; size_t bin_size ; clblasTranspose transA ; // clblasTranspose transB ; // unsigned divK ; // Required divisor of N (use 1 when N can be of any value) size_t ls[2] ; // Local size (the work-group size) size_t bwi[2] ; // Block size work-item: Number of elements calculated by each work items // So basically each kernel is computing a block of // (ls[0]*bwi[0]) x (ls[1]*bwi[1]) // elements of C. std::string mult; } ; private: // Constructor & Destructor clBlashawaiiSgemmSplitKernelFunctor(Args & args, const Variant * variant, cl_int & err) ; cl_int KernelsLaunch(cl_command_queue queue, cl_kernel Kernel[4], Args &args); const Variant * m_variantSplit ; // Pointer to a 'const static' object describing the kernel variant. public: // Provide a suitable hawaii_sgemmChannelConflict for the specified args // or NULL if none static clBlashawaiiSgemmSplitKernelFunctor * provide(clblasSgemmFunctor::Args & args, char* DevName) ; virtual clblasStatus execute(Args &args) ; }; #endifclblas-2.10/src/library/blas/functor/include/tahiti.h000066400000000000000000000027071264277366700226400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _CLBLAS_FUNCTION_SELECTOR_TAHITI_ #define _CLBLAS_FUNCTION_SELECTOR_TAHITI_ #include //#include class FunctorSelectorTahiti : public clblasFunctorSelector { private: FunctorSelectorTahiti(); static FunctorSelectorTahiti instance; public: // Provide a dgemmFunctor usable only if N is a multiple of blocksize // and incx==1 virtual clblasDgemmFunctor * select_dgemm_specific(clblasDgemmFunctor::Args & args); virtual clblasDtrsmFunctor * select_dtrsm_specific(clblasDtrsmFunctor::Args & args); virtual clblasSgemmFunctor * select_sgemm_specific(clblasSgemmFunctor::Args & args); }; #endif // _CLBLAS_FUNCTION_SELECTOR_TAHITI_ clblas-2.10/src/library/blas/functor/tahiti.cc000066400000000000000000000073021264277366700213470ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "gpu_dtrsm.h" //#include "gcn_dgemm.h" #include "gcn_dgemmCommon.h" #include "gcn_dgemmSmallMatrices.h" #include "gcn_sgemmSmallMatrices.h" FunctorSelectorTahiti FunctorSelectorTahiti::instance ; FunctorSelectorTahiti::FunctorSelectorTahiti() : clblasFunctorSelector(TAHITI) { } // // The selector function for DGEMM on tahiti // // clblasDgemmFunctor * FunctorSelectorTahiti::select_dgemm_specific(clblasDgemmFunctor::Args & args) { #ifdef CLBLAS_TAHITI_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_dgemm_specific(args); #else clblasDgemmFunctor * functor; bool NN_NT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans )); bool SmallMatrices = args.M/6*args.N/6<85*85; SmallMatrices= SmallMatrices && ((args.M%24==0&&args.N%24==0)||(args.M%16==0&&args.N%16==0))&&args.K%8==0 &&NN_NT; if (args.alpha!=0) { if (SmallMatrices) { functor = clBlasGCNDgemmSmallMatricesFunctor::provide(args, "Tahiti"); if (functor) return functor; } functor = clBlasGCNdgemmCommonFunctor::provide(args, "Tahiti"); if (functor) return functor; } // else use the fallback implementation return this->clblasFunctorSelector::select_dgemm_specific(args); #endif } // The selector function for DTRSM on tahiti // clblasDtrsmFunctor * FunctorSelectorTahiti::select_dtrsm_specific(clblasDtrsmFunctor::Args & args) { #ifdef CLBLAS_TAHITI_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_dtrsm_specific(args); #else clblasDtrsmFunctor * functor; functor = clblasDtrsmFunctorGpu::provide(args, "Tahiti"); if (functor) return functor; // else use the fallback implementation return this->clblasFunctorSelector::select_dtrsm_specific(args); #endif } clblasSgemmFunctor * FunctorSelectorTahiti::select_sgemm_specific(clblasSgemmFunctor::Args & args) { #ifdef CLBLAS_TAHITI_DYNAMIC_KERNEL return this->clblasFunctorSelector::select_sgemm_specific(args); #else clblasSgemmFunctor * functor; bool Not_TT = ((args.transA==clblasNoTrans && args.transB==clblasTrans ) || ( args.transA==clblasNoTrans && args.transB==clblasNoTrans ) || ( args.transA==clblasTrans && args.transB==clblasNoTrans )); bool SmallMatrices = args.M/6*args.N/6<100*100 || ((args.M%64!=0 && args.N%64!=0 && args.M<1900 &&args.N<1900 ) && (args.M%96!=0 && args.N%96!=0 && args.M<1900 &&args.N<1900 )); SmallMatrices= (SmallMatrices && (args.M%32==0&&args.N%32==0)) ; SmallMatrices=SmallMatrices&&Not_TT&&args.K%16==0; if (args.alpha!=0) { if (SmallMatrices) { functor = clBlasGCNSgemmSmallMatricesFunctor::provide(args, "Tahiti"); if (functor) return functor; } functor = clblasSgemmFunctorGCN::provide(args, "Tahiti"); if (functor) return functor; } // else use the fallback implementation return this->clblasFunctorSelector::select_sgemm_specific(args); #endif }clblas-2.10/src/library/blas/generic/000077500000000000000000000000001264277366700175105ustar00rootroot00000000000000clblas-2.10/src/library/blas/generic/binary_lookup.cc000066400000000000000000000436501264277366700227040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #include // for _mkdir #else #include #endif extern "C" { #include } // size for clGetDeviceInfo queries #define SIZE 256 #define CAPS_DEBUG 0 #include static char * sep() { #ifdef __WIN32 return (char*)"\\"; #else return (char*)"/"; #endif } static std::string cache_path; static bool cache_enabled(false); extern "C" void clblasInitBinaryCache() { const char * path = getenv("CLBLAS_CACHE_PATH"); if (path) { cache_path = std::string(path) + sep(); cache_enabled = true; } else { cache_path = ""; } } BinaryLookup::CacheEntry::CacheEntry(const std::string & filename) : m_filename(filename), m_successful_creation(false) { } void BinaryLookup::CacheEntry::close() { #ifdef _WIN32 CloseHandle(this->m_handle); #else ::close(*(int*)this->m_handle); delete (int*)this->m_handle; #endif } bool BinaryLookup::CacheEntry::successful_creation() { return this->m_successful_creation; } bool BinaryLookup::CacheEntry::exclusive_create() { #ifdef _WIN32 HANDLE handle = CreateFile(this->m_filename.c_str(), GENERIC_WRITE, 0, // no share with other process NULL, CREATE_NEW, FILE_ATTRIBUTE_NORMAL, NULL); this->m_handle = handle; this->m_successful_creation = (handle != INVALID_HANDLE_VALUE); return this->m_successful_creation; #else int * fd = new int[1]; *fd = open (this->m_filename.c_str(), O_CREAT | O_EXCL, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); this->m_handle = fd; this->m_successful_creation = (*fd != -1); return *fd >= 0; #endif } BinaryLookup::BinaryLookup(cl_context ctxt, cl_device_id device, const std::string & kernel_name) : m_context(ctxt), m_device(device), m_program(NULL), m_binary(0), m_signature(0), m_cache_enabled(cache_enabled) { // initialize the entry name this->m_cache_entry_name = kernel_name; if (this->m_cache_enabled) { // retrieve device informations to compute the path of the cache cl_int err = this->retrieveDeviceAndDriverInfo(); if (err != CL_SUCCESS) { cache_enabled = false; this->m_cache_enabled = false; } } } BinaryLookup::~BinaryLookup() { delete this->m_binary; delete this->m_signature; } BinaryLookup::Variant::Variant() : m_kind((VariantKind)0), m_size(0), m_data(0) { } BinaryLookup::Variant::Variant(VariantKind kind, char * data, size_t size) : m_kind(kind), m_size(size) { this->m_data = new char[this->m_size]; memcpy(this->m_data, data, size); } BinaryLookup::Variant::~Variant() { // delete this->m_data; } void BinaryLookup::variantInt(int num) { m_variants.push_back(Variant(INT, (char*)&num, sizeof(num))); } void BinaryLookup::variantDouble(double num) { m_variants.push_back(Variant(DOUBLE, (char*)&num, sizeof(num))); } void BinaryLookup::variantCompileOptions(const std::string & opts) { m_variants.push_back(Variant(STRING, (char*)opts.c_str(), opts.size())); } void BinaryLookup::variantRaw(const void * data, size_t bytes) { m_variants.push_back(Variant(DATA, (char*)data, bytes)); } enum BinaryRepresentation { LSB, MSB, UNKNOWN }; static enum BinaryRepresentation 
getStorageMode(char * data) { if (data[0] == 'C' && data[1] == 'L' && data[2] == 'B' && data[3] == '\0') return LSB; if (data[0] == 'B' && data[1] == 'L' && data[2] == 'C' && data[3] == '\0') return MSB; return UNKNOWN; } void BinaryLookup::finalizeVariant() { // serialize variants size_t whole_variant_size_in_bytes = 0; // store 1 byte for the variant kind whole_variant_size_in_bytes += this->m_variants.size() * sizeof(int); // for the variant kind whole_variant_size_in_bytes += this->m_variants.size() * sizeof(size_t); // for the variant size // add every variant sizes for(size_t i=0 ; im_variants.size() ; ++i) { const Variant & v = this->m_variants[i]; // compute the whole size of the signature whole_variant_size_in_bytes += v.m_size; } this->m_header.signature_size = whole_variant_size_in_bytes; this->m_signature = new char[whole_variant_size_in_bytes]; char * current_address = this->m_signature; for(size_t i=0 ; im_variants.size() ; ++i) { Variant v = this->m_variants[i]; // write the variant kind memcpy(current_address, &v.m_kind, sizeof(int)); current_address += sizeof(v.m_kind); // write the variant size memcpy(current_address, &v.m_size, sizeof(v.m_size)); current_address += sizeof(v.m_size); // write the variant itself memcpy(current_address, v.m_data, v.m_size); current_address += v.m_size; } // Update the cache entry name if there are variants... if (whole_variant_size_in_bytes != 0) { char * md5_sum = md5sum(this->m_signature, this->m_header.signature_size); this->m_cache_entry_name = md5_sum; delete md5_sum; } else { this->m_cache_entry_name += ".db"; } } bool BinaryLookup::loadHeader(std::ifstream &file, size_t length) { file.read ((char*)&this->m_header, sizeof(Header)); // FIXME: Consider LSB Vs MSB number representation assert(getStorageMode(this->m_header.magic_key) == LSB); if (this->m_header.whole_file_size != (int)length) { // the file has not been correctly initialized (yet) return false; } return true; } bool BinaryLookup::loadBinaryAndSignature(std::ifstream &file) { { this->m_binary = new unsigned char [this->m_header.binary_size]; const std::istream& res = file.read((char*)this->m_binary, this->m_header.binary_size); if (!res.good()) return false; } { this->m_signature = new char [this->m_header.signature_size]; const std::istream& res = file.read((char*)this->m_signature, this->m_header.signature_size); if (!res.good()) return false; this->m_variants.clear(); char * current = this->m_signature; for (int i=0 ; im_header.signature_size ; ++i) { Variant v; v.m_kind = *(VariantKind*) current; i += sizeof(int); current += sizeof(int); v.m_size = *(size_t*) current; i += sizeof(size_t); current += sizeof(size_t); v.m_data = new char[v.m_size]; memcpy(v.m_data, current, v.m_size); i += v.m_size; current += v.m_size; this->m_variants.push_back(v); } } return true; } bool BinaryLookup::tryLoadCacheFile() { // may create empty file or may wait until file is ready const std::string & filename = this->m_path + this->m_cache_entry_name; std::ifstream file (filename.c_str(), std::ios_base::binary); if (file.is_open()) { file.seekg (0, file.end); size_t length = file.tellg(); file.seekg (0, file.beg); if (length == 0) { // the file is corrupted, so return false return false; } bool st; st = loadHeader(file, length); if (! st) return false; st = loadBinaryAndSignature(file); if (! st) return false; file.close(); return true; } else { return false; } } bool BinaryLookup::found() { // if we could not create the directory, it is useless to if (! 
this->m_cache_enabled) { return false; // not found } this->finalizeVariant(); // serialize variant and cumpute checksum on it // also compute the tree to search from the cache entry (this->m_cache_entry_name, cache path ??) if (tryLoadCacheFile()) { cl_int err = buildFromBinary(this->m_binary, this->m_header.binary_size, NULL); // return false if the buildFromBinary failed, true else return err==CL_SUCCESS; } return false; } static cl_int getSingleBinaryFromProgram(cl_program program, std::vector & binary) { // 3 - Determine the size of each program binary size_t size; cl_int err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL); if (err != CL_SUCCESS) { std::cerr << "Error querying for program binary sizes" << std::endl; return err; } binary.resize(size); binary[0] = new unsigned char[size]; unsigned char * binary_address[1] = { binary[0] }; // 4 - Get all of the program binaries err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 1 * sizeof(unsigned char*), binary_address, NULL); if (err != CL_SUCCESS) { #if CAPS_DEBUG std::cerr << "Error querying for program binaries" << std::endl; #endif return err; } return CL_SUCCESS; } cl_int BinaryLookup::writeCacheFile(std::vector &data) { if (! this->m_cache_enabled) { return 0; } // exclusive open to ensure that only one thread will write the file const std::string & filename = this->m_path + this->m_cache_entry_name; CacheEntry cache_file(filename); bool created = cache_file.exclusive_create(); // try to exclusively create the cache file on the disk if (created) { // if it was created by the current thread, this one will write into cache file cache_file.close(); const std::string & filename = this->m_path + this->m_cache_entry_name; std::ofstream file (filename.c_str(), std::ios_base::binary); file.write((char*)&this->m_header, sizeof(m_header)); file.write((char*)data[0], this->m_header.binary_size); file.write((char*)this->m_signature, this->m_header.signature_size); file.close(); return CL_SUCCESS; } // other thread do not write the cache file //Ben : do we really need to output something here, all the different branches return 0 (CL_SUCCESS) return CL_SUCCESS; } cl_int BinaryLookup::populateCache() { // FIXME: support MSB this->m_header.magic_key[0] = 'C'; this->m_header.magic_key[1] = 'L'; this->m_header.magic_key[2] = 'B'; this->m_header.magic_key[3] = '\0'; std::vector data; cl_int err = getSingleBinaryFromProgram(this->m_program, data); if (err != CL_SUCCESS) { return err; } this->m_header.header_size = sizeof(Header); this->m_header.binary_size = data.size(); this->m_header.whole_file_size = this->m_header.header_size + this->m_header.binary_size + this->m_header.signature_size; err = writeCacheFile(data); return CL_SUCCESS; } cl_int BinaryLookup::buildFromSource(const char * source) { cl_int err; this->m_program = BinaryLookup::buildProgramFromSource(source, this->m_context, this->m_device, err); if (err != CL_SUCCESS) { fprintf(stderr, "Warning: clCreateProgramWithSource failed with code %d\n", err); return err; } // write to the cache this->populateCache(); return CL_SUCCESS; } cl_int BinaryLookup::buildFromLoadedBinary(const void * data, size_t len, const char * BuildOption) { cl_int err; this->m_program = BinaryLookup::buildProgramFromBinary((char*) data, len, this->m_context, this->m_device, err, BuildOption); return err; } cl_int BinaryLookup::buildFromBinary(const void * data, size_t len, const char * BuildOption ) { cl_int err = buildFromLoadedBinary(data, len, BuildOption); if (err != 
CL_SUCCESS) return err; // write to the cache this->populateCache(); return CL_SUCCESS; } cl_program BinaryLookup::buildProgramFromSource(const char * source, cl_context context, cl_device_id device, cl_int & err, const char * options) { cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &err); if (err != CL_SUCCESS) return NULL; err = clBuildProgram(program, 1, /* FIXME: 1 device */ &device, options, NULL, NULL); if (err != CL_SUCCESS) return NULL; return program; } cl_program BinaryLookup::buildProgramFromBinary(const char * data, size_t data_size, cl_context context, cl_device_id device, cl_int & err, const char * options) { cl_program program = clCreateProgramWithBinary(context, 1, // num_device &device, // device_list &data_size, // lengths (const unsigned char **)&data, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "Warning: clCreateProgramWithBinary failed with code %d\n", err); return NULL; } err = clBuildProgram(program, 1, /* FIXME: 1 device */ &device, options, NULL, NULL); if (err != CL_SUCCESS) { return NULL; } return program; } cl_program BinaryLookup::getProgram() { return this->m_program; } void BinaryLookup::setProgram(cl_program program) { this->m_program = program; } static int make_directory(const std::string &path) { #ifdef _WIN32 return _mkdir (path.c_str()); #else return mkdir (path.c_str(), S_IRWXU); #endif } static void do_mkdir(const std::string &path) { int st = make_directory (path.c_str()); if (st != 0) { if ( errno != EEXIST ) { std::string tmp = "Cannot not create directory '" + std::string(path) + "': "; throw tmp; } } } cl_int BinaryLookup::retrieveDeviceAndDriverInfo() { char m_device_vendor[SIZE]; char m_device_name[SIZE]; char m_driver_version[SIZE]; cl_int err = clGetDeviceInfo(this->m_device, CL_DEVICE_VENDOR, sizeof(m_device_vendor), &m_device_vendor, NULL); if (err != CL_SUCCESS) { return err; } err = clGetDeviceInfo(this->m_device, CL_DEVICE_NAME, sizeof(m_device_name), &m_device_name, NULL); if (err != CL_SUCCESS) { return err; } err = clGetDeviceInfo(this->m_device, CL_DRIVER_VERSION, sizeof(m_driver_version), &m_driver_version, NULL); if (err != CL_SUCCESS) { return err; } #if CAPS_DEBUG fprintf(stderr, "device vendor = %s\n", this->m_device_vendor); fprintf(stderr, "device name = %s\n", this->m_device_name); fprintf(stderr, "driver version = %s\n", this->m_driver_version); #endif try { const std::string & root = (std::string(cache_path) + m_device_vendor + sep()); do_mkdir(root.c_str()); const std::string & root2 = (root + m_device_name + sep()); do_mkdir(root2.c_str()); const std::string & root3 = (root2 + m_driver_version + sep()); do_mkdir(root3.c_str()); const std::string & root4 = (root3 + this->m_cache_entry_name + sep()); do_mkdir(root4.c_str()); this->m_path = root4; return CL_SUCCESS; } catch (std::string & e) { fprintf(stderr, "%s\n", e.c_str()); cache_enabled = false; this->m_cache_enabled = false; return CL_INVALID_VALUE; } } clblas-2.10/src/library/blas/generic/blas_funcs.c000066400000000000000000000044731264277366700220030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include int funcBlasLevel(BlasFunctionID funcID) { switch(funcID) { case CLBLAS_SWAP: case CLBLAS_SCAL: case CLBLAS_COPY: case CLBLAS_AXPY: case CLBLAS_DOT: case CLBLAS_REDUCTION_EPILOGUE: case CLBLAS_ROTG: case CLBLAS_ROTMG: case CLBLAS_ROT: case CLBLAS_ROTM: case CLBLAS_iAMAX: case CLBLAS_NRM2: case CLBLAS_ASUM: return 1; case CLBLAS_GEMV: case CLBLAS_SYMV: case CLBLAS_TRMV: case CLBLAS_TRSV: case CLBLAS_TRSV_GEMV: case CLBLAS_HEMV: case CLBLAS_SYR: case CLBLAS_SYR2: case CLBLAS_GER: case CLBLAS_HER: case CLBLAS_HER2: case CLBLAS_TPMV: case CLBLAS_SPMV: case CLBLAS_HPMV: case CLBLAS_TPSV: case CLBLAS_SPR: case CLBLAS_SPR2: case CLBLAS_HPR: case CLBLAS_HPR2: case CLBLAS_GBMV: case CLBLAS_TBMV: case CLBLAS_SBMV: case CLBLAS_HBMV: case CLBLAS_TBSV: return 2; default: return 3; } } bool funcHasBeta(BlasFunctionID funcID) { return !funcHasTriangMatrix(funcID); } bool funcHasTriangMatrix(BlasFunctionID funcID) { bool ret = false; switch (funcID) { // go through case CLBLAS_TRMM: case CLBLAS_TRSM: case CLBLAS_TRMV: case CLBLAS_HEMV: case CLBLAS_TRSV: ret = true; break; default: /* do nothing */ break; } return ret; } clblas-2.10/src/library/blas/generic/common.c000066400000000000000000000546371264277366700211630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include #include "clblas-internal.h" #if defined(DUMP_CLBLAS_KERNELS) && !defined(KEEP_CLBLAS_KERNEL_SOURCES) #define KEEP_CLBLAS_KERNEL_SOURCES #endif int clblasInitialized = 0; CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER]; struct KernelCache *clblasKernelCache = NULL; enum { BUILD_LOG_SIZE = 65536 }; static __inline void storeErrorCode(cl_int *error, cl_int code) { if (error != NULL) { *error = code; } } #ifndef PRINT_BUILD_ERRORS #define PRINT_BUILD_ERRORS #endif #ifdef PRINT_BUILD_ERRORS static char *allocBuildLog(void) { char *log; log = malloc(BUILD_LOG_SIZE); if (log) { log[0] = '\0'; } return log; } static void freeBuildLog(char *buildLog) { free(buildLog); } static void printBuildError( cl_int error, cl_device_id device, SolverKgen kgen, const SubproblemDim *dims, const PGranularity *pgran, const CLBLASKernExtra *kextra, const char *source, const char *buildLog) { char name[128]; char dimStr[1024]; char pgranStr[1024]; char *p; MemoryPattern *mempat = NULL; unsigned int i, j; const char *s; name[0] = '\0'; clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(name), name, NULL); // lookup memory pattern s = NULL; for (i = 0; i < BLAS_FUNCTIONS_NUMBER; i++) { for (j = 0; j < clblasSolvers[i].nrPatterns; j++) { mempat = &clblasSolvers[i].memPatterns[j]; if (kgen == mempat->sops->genKernel) { s = kernelTypeString(kextra->kernType); break; } } if (s != NULL) { break; } } // sprintf Subproblem dimensions p = dimStr; for (i = 0; i < mempat->nrLevels; i++) { p = sprintfGranulation(p, dims, i); strcat(p, "; "); p += strlen(p); } // sprintf data parallelism granularity sprintf(pgranStr, "pgran->wgDim = %d, pgran->wgSize[0] = %u, " "pgran->wgSize[1] = %u, pgran->wfSize = %u", pgran->wgDim, pgran->wgSize[0], pgran->wgSize[1], pgran->wfSize); fprintf(stderr, "\n========================================================\n\n"); fprintf(stderr, "AN INTERNAL KERNEL BUILD ERROR OCCURRED!\n"); fprintf(stderr, "device name = %s\n", name); fprintf(stderr, "error = %d\n", error); fprintf(stderr, "memory pattern = %s, %s kernel generator\n", mempat->name, s); fprintf(stderr, "Subproblem dimensions: %s\n", dimStr); fprintf(stderr, "Parallelism granularity: %s\n", pgranStr); fprintf(stderr, "Kernel extra flags: %u\n", kextra->flags); fprintf(stderr, "Source:\n\n%s\n\n", source); fprintf(stderr, "--------------------------------------------------------\n\n"); if (buildLog) { fprintf(stderr, "Build log:\n\n%s\n", buildLog); } else { fprintf(stderr, "Build log is unavailable\n"); } fprintf(stderr, "========================================================\n\n"); } #else /* PRINT_BUILD_ERRORS */ static __inline char* allocBuildLog(void) { /* stub, do nothing */ return NULL; } #define freeBuildLog(log) /* stub, do nothing */ #define printBuildError(error, device, kgen, \ dims, pgran, kextra, source, buildLog) /* stub, do nothing */ #endif /* !PRINT_BUILD_ERRORS */ static void extraDtor(struct Kernel *kernel) { if (kernel->extra != NULL) { free(kernel->extra); kernel->extra = NULL; } } static char *sprintfDim( char *buf, size_t dim, const char *dimName, int level, bool first) { if (!first) { strcat(buf, ", "); buf += strlen(buf); } if (dim == SUBDIM_UNUSED) { sprintf(buf, "dims[%d].%s = SUBDIM_UNUSED", level, dimName); } else { sprintf(buf, "dims[%d].%s = %lu", level, dimName, dim); } buf += strlen(buf); return buf; } const char VISIBILITY_HIDDEN *kernelTypeString(CLBlasKernelType 
ktype) { switch (ktype) { case CLBLAS_COMPUTING_KERNEL: return "computing"; case CLBLAS_PREP_A_KERNEL: return "preparative for matrix A"; case CLBLAS_PREP_B_KERNEL: return "preparative for matrix B"; default: return NULL; } } /* * Assign a scalar multiplied on a matrix a kernel argument */ void VISIBILITY_HIDDEN assignScalarKarg(KernelArg *arg, const void *value, DataType dtype) { arg->typeSize = dtypeSize(dtype); memcpy(arg->arg.data, value, arg->typeSize); } void VISIBILITY_HIDDEN calcGlobalThreads( size_t globalThreads[2], const SubproblemDim *wgDim, const PGranularity *pgran, size_t M, size_t N) { globalThreads[1] = 1; if ((wgDim->itemX != SUBDIM_UNUSED) && (wgDim->itemY != SUBDIM_UNUSED)) { size_t groupWorkX, groupWorkY; size_t nrGroupsX, nrGroupsY; int nrDims; groupWorkX = wgDim->itemX; groupWorkY = wgDim->itemY; nrGroupsX = N / groupWorkX; if (N % groupWorkX) { nrGroupsX++; } nrGroupsY = M / groupWorkY; if (M % groupWorkY) { nrGroupsY++; } nrDims = (pgran == NULL) ? 1 : pgran->wgDim; if (nrDims == 1) { globalThreads[0] = nrGroupsX * nrGroupsY; } else { globalThreads[0] = nrGroupsY; globalThreads[1] = nrGroupsX; } } else { size_t totalWork, groupWork; if (wgDim->itemX != SUBDIM_UNUSED) { totalWork = N; groupWork = wgDim->itemX; } else { totalWork = M; groupWork = wgDim->itemY; } globalThreads[0] = totalWork / groupWork; if (totalWork % groupWork) { globalThreads[0]++; } } if (pgran != NULL) { globalThreads[0] *= pgran->wgSize[0]; globalThreads[1] *= pgran->wgSize[1]; } } cl_int VISIBILITY_HIDDEN getKernelContext(cl_kernel kernel, cl_context *context) { cl_int err; cl_context ctx; err = clGetKernelInfo(kernel, CL_KERNEL_CONTEXT, sizeof(cl_context), &ctx, NULL); if (err != CL_SUCCESS) return err; if (context != NULL) *context = ctx; return err; } cl_int VISIBILITY_HIDDEN getQueueContext(cl_command_queue queue, cl_context *context) { cl_int err; cl_context ctx; err = clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL); if (err != CL_SUCCESS) return err; if (context != NULL) *context = ctx; return err; } cl_int VISIBILITY_HIDDEN getQueueDevice(cl_command_queue queue, cl_device_id *device) { cl_int err; cl_device_id dev; err = clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE, sizeof(cl_device_id), &dev, NULL); if (err != CL_SUCCESS) return err; if (device != NULL) *device = dev; return err; } cl_int VISIBILITY_HIDDEN getQueueProperties( cl_command_queue queue, cl_command_queue_properties *props) { cl_int err; cl_command_queue_properties p; err = clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &p, NULL); if (err != CL_SUCCESS) return err; if (props != NULL) *props = p; return err; } Kernel VISIBILITY_HIDDEN *loadKernel( const unsigned char** buffer, size_t sizeBuffer, KernelKey *key, const CLBLASKernExtra *extra, cl_int *error) { cl_int status = CL_SUCCESS; Kernel* kernel; kernel = allocKernel(); if (kernel == NULL) { return NULL; } kernel->program = createClProgramWithBinary(key->context, key->device, (unsigned char*)*buffer, sizeBuffer, &status); if (status == CL_SUCCESS) { kernel->extraSize = sizeof(CLBLASKernExtra); kernel->extra = calloc(1, kernel->extraSize); *(CLBLASKernExtra*)(kernel->extra) = *extra; kernel->dtor = extraDtor; kernel->noSource = 1; } else { putKernel(NULL, kernel); storeErrorCode(error, status); kernel = NULL; } return kernel; } #if !defined(DUMP_CLBLAS_KERNELS) /* * Drop the program's source so as to consume memory as few as possible * at caching */ static cl_int dropProgramSource(cl_program *program, 
cl_context ctx, cl_device_id devID) { size_t size; unsigned char *bin; cl_program p = *program; cl_int err; size = getProgramBinarySize(p); bin = getProgramBinary(p); /* * Don't release the original program until a new one is created * in order to retain its own reference to the context if it is * released by user */ p = createClProgramWithBinary(ctx, devID, bin, size, &err); if (err == CL_SUCCESS) { clReleaseProgram(*program); *program = p; } free(bin); return err; } #endif /* !DUMP_CLBLAS_KERNELS */ Kernel *makeKernel( cl_device_id device, cl_context context, SolverKgen kernelGenerator, cl_program program, const SubproblemDim *dims, const PGranularity *pgran, const CLBLASKernExtra *extra, const char *buildOpts, cl_int *error) { cl_int err; char *source; ssize_t size; Kernel *kernel; char *log; #ifdef DEBUG_2 printf("Make kernel called\n"); printf("x : %d, y : %d, itemX: %d, itemY: %d\n", dims->x, dims->y, dims->itemX, dims->itemY); printf("PG : wgSize[0] : %d, wgSize[1] : %d, wfSize: %d\n", pgran->wgSize[0], pgran->wgSize[1], pgran->wfSize); #endif kernel = allocKernel(); if (kernel == NULL) { free(source); storeErrorCode(error, CL_OUT_OF_HOST_MEMORY); return NULL; } if (kernelGenerator) { size = kernelGenerator(NULL, 0, dims, pgran, (void*)extra); if (size < 0) { storeErrorCode(error, CL_OUT_OF_HOST_MEMORY); return NULL; } source = calloc(1, size); if (source == NULL) { storeErrorCode(error, CL_OUT_OF_HOST_MEMORY); return NULL; } if (kernelGenerator(source, size, dims, pgran, (void*)extra) != size) { free(source); storeErrorCode(error, CL_OUT_OF_HOST_MEMORY); return NULL; } log = allocBuildLog(); //#define DEBUG_2 #ifdef DEBUG_2 printf("Build Options used %s \n", buildOpts); printf("Source kernel used %s \n", source); #endif #undef DEBUG_2 kernel->program = buildClProgram(source, buildOpts, context, device, log, BUILD_LOG_SIZE, &err); if (err != CL_SUCCESS) { printBuildError(err, device, kernelGenerator, dims, pgran, extra, source, log); freeBuildLog(log); putKernel(NULL, kernel); free(source); storeErrorCode(error, err); return NULL; } else { // #define DEBUG_2 #ifdef DEBUG_2 printf("Kernel compilation succeeded\n"); #endif #undef DEBUG_2 } freeBuildLog(log); free(source); #if !defined(KEEP_CLBLAS_KERNEL_SOURCES) if (err == CL_SUCCESS) { err = dropProgramSource(&kernel->program, context, device); kernel->noSource = 1; } #endif /* !DUMP_CLBLAS_KERNELS */ if (err != CL_SUCCESS) { putKernel(NULL, kernel); storeErrorCode(error, err); return NULL; } } else { kernel->program = program; } kernel->extraSize = sizeof(CLBLASKernExtra); kernel->extra = calloc(1, kernel->extraSize); *(CLBLASKernExtra*)(kernel->extra) = *extra; kernel->dtor = extraDtor; storeErrorCode(error, CL_SUCCESS); return kernel; } void setupBuildOpts( char opts[BUILD_OPTS_MAXLEN], cl_device_id devID, MemoryPattern *mempat) { TargetDevice target; target.id = devID; identifyDevice(&target); opts[0] = '\0'; #if !defined NDEBUG // Nvidia runtime does not appear to support the -g flag, at least in their OpenCL v1.1 runtime if( target.ident.vendor != VENDOR_NVIDIA ) addBuildOpt( opts, BUILD_OPTS_MAXLEN, "-g" ); #endif /* NDEBUG */ if (target.ident.vendor == VENDOR_NVIDIA && !strcmp(mempat->name, "2-staged cached global memory based " "block trsm")) { addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable"); } } void addBuildOpt( char * opts, size_t len, const char * option) { size_t l = strlen(opts); if (l > 0 && !isspace(opts[l-1]) && l+1 < len) { opts[l] = ' '; opts[l+1] = '\0'; l++; } strncat(opts, option, len - l - 1); } char 
VISIBILITY_HIDDEN *sprintfGranulation(char *buf, const SubproblemDim *dim, int level) { buf = sprintfDim(buf, dim[level].itemY, "itemY", level, true); buf = sprintfDim(buf, dim[level].itemX, "itemX", level, false); buf = sprintfDim(buf, dim[level].y, "y", level, false); buf = sprintfDim(buf, dim[level].x, "x", level, false); buf = sprintfDim(buf, dim[level].bwidth, "bwidth", level, false); strcat(buf, "; "); buf += strlen(buf); return buf; } clblasStatus VISIBILITY_HIDDEN checkMatrixSizes( DataType dtype, clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_mem A, size_t offA, size_t lda, // lda is passed as zero for packed matrices ErrorCodeSet err ) { size_t memSize, matrSize, tsize, memUsed; size_t unusedTail = 0; bool tra; if ((M == 0) || (N == 0)) { return clblasInvalidDim; } tsize = dtypeSize(dtype); tra = (order == clblasRowMajor && transA != clblasNoTrans) || (order == clblasColumnMajor && transA == clblasNoTrans); if( lda > 0 ) // For Non-packed matrices { if (tra) { if (lda < M) { switch( err ) { case A_MAT_ERRSET: return clblasInvalidLeadDimA; case B_MAT_ERRSET: return clblasInvalidLeadDimB; case C_MAT_ERRSET: return clblasInvalidLeadDimC; default: return clblasNotImplemented; } } matrSize = ((N - 1) * lda + M) * tsize; unusedTail = ( lda - N ) * tsize; } else { if (lda < N) { switch( err ) { case A_MAT_ERRSET: return clblasInvalidLeadDimA; case B_MAT_ERRSET: return clblasInvalidLeadDimB; case C_MAT_ERRSET: return clblasInvalidLeadDimC; default: return clblasNotImplemented; } } matrSize = ((M - 1) * lda + N) * tsize; unusedTail = ( lda - M ) * tsize; } } else { // For the case of packed matrices matrSize = ((M * (N+1)) / 2) * tsize; } offA *= tsize; if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) != CL_SUCCESS) { switch( err ) { case A_MAT_ERRSET: return clblasInvalidMatA; case B_MAT_ERRSET: return clblasInvalidMatB; case C_MAT_ERRSET: return clblasInvalidMatC; default: return clblasNotImplemented; } } // Calculates the memory required. Note that 'matrSize' already takes into account the fact that // there might be an unused tail, i.e. the elements between lda and M in the last column if // column major is used or between lda and N in the last row if row major is used. 
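    /*
     * Illustrative example (hypothetical values, not taken from the library):
     * for a column-major, non-transposed float matrix with M = 100, N = 50,
     * lda = 128 and offA = 0, the branch above yields
     *     matrSize = ((N - 1) * lda + M) * sizeof(cl_float)
     *              = (49 * 128 + 100) * 4 = 25488 bytes,
     * so the check below fails unless the cl_mem buffer holds at least
     * offA (already scaled to bytes above) + 25488 bytes; the second
     * comparison guards against size_t wrap-around of offA + matrSize.
     */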
memUsed = offA + matrSize; if (( memUsed > memSize ) || (offA + matrSize < offA)) { switch( err ) { case A_MAT_ERRSET: return clblasInsufficientMemMatA; case B_MAT_ERRSET: return clblasInsufficientMemMatB; case C_MAT_ERRSET: return clblasInsufficientMemMatC; default: return clblasNotImplemented; } } return clblasSuccess; } clblasStatus VISIBILITY_HIDDEN checkBandedMatrixSizes( DataType dtype, clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_mem A, size_t offA, size_t lda, ErrorCodeSet err ) { size_t memSize, matrSize, tsize, K, memUsed; size_t unusedTail = 0; bool tra; if ((M == 0) || (N == 0)) { return clblasInvalidDim; } tsize = dtypeSize(dtype); K = KL + KU + 1; tra = (order == clblasRowMajor && transA != clblasNoTrans) || (order == clblasColumnMajor && transA == clblasNoTrans); if (lda < K) { switch( err ) { case A_MAT_ERRSET: return clblasInvalidLeadDimA; case B_MAT_ERRSET: return clblasInvalidLeadDimB; case C_MAT_ERRSET: return clblasInvalidLeadDimC; default: return clblasNotImplemented; } } if (tra) { matrSize = ((N - 1) * lda + K) * tsize; unusedTail = ( lda - N ) * tsize; } else { matrSize = ((M - 1) * lda + K) * tsize; unusedTail = ( lda - M ) * tsize; } offA *= tsize; if (clGetMemObjectInfo(A, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) != CL_SUCCESS) { switch( err ) { case A_MAT_ERRSET: return clblasInvalidMatA; case B_MAT_ERRSET: return clblasInvalidMatB; case C_MAT_ERRSET: return clblasInvalidMatC; default: return clblasNotImplemented; } } // Calculates the memory required. Note that 'matrSize' already takes into account the fact that // there might be an unused tail, i.e. the elements between lda and M in the last column if // column major is used or between lda and N in the last row if row major is used. 
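    /*
     * Illustrative example (hypothetical values, not taken from the library):
     * for a double-precision band matrix with KL = 2 and KU = 1, the band
     * height is K = KL + KU + 1 = 4; with N = 10, lda = 4 (so lda >= K holds)
     * and the 'tra' branch above, the required storage is
     *     matrSize = ((N - 1) * lda + K) * sizeof(cl_double)
     *              = (9 * 4 + 4) * 8 = 320 bytes,
     * which the check below adds to offA and compares against the cl_mem size.
     */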
memUsed = offA + matrSize; if (memUsed > memSize) { switch( err ) { case A_MAT_ERRSET: return clblasInsufficientMemMatA; case B_MAT_ERRSET: return clblasInsufficientMemMatB; case C_MAT_ERRSET: return clblasInsufficientMemMatC; default: return clblasNotImplemented; } } return clblasSuccess; } clblasStatus VISIBILITY_HIDDEN checkVectorSizes( DataType dtype, size_t N, cl_mem x, size_t offx, int incx, ErrorCodeSet err ) { size_t memSize, sizev; size_t tsize; if (N == 0) { return clblasInvalidDim; } if (incx == 0) { switch( err ) { case X_VEC_ERRSET: return clblasInvalidIncX; case Y_VEC_ERRSET: return clblasInvalidIncY; default: return clblasNotImplemented; } } if (clGetMemObjectInfo(x, CL_MEM_SIZE, sizeof(memSize), &memSize, NULL) != CL_SUCCESS) { switch( err ) { case X_VEC_ERRSET: return clblasInvalidVecX; case Y_VEC_ERRSET: return clblasInvalidVecY; default: return clblasNotImplemented; } } tsize = dtypeSize(dtype); sizev = ((N - 1) * abs(incx) + 1) * tsize; offx *= tsize; if ((offx + sizev > memSize) || (offx + sizev < offx)) { switch( err ) { case X_VEC_ERRSET: return clblasInsufficientMemVecX; case Y_VEC_ERRSET: return clblasInsufficientMemVecY; default: return clblasNotImplemented; } } return clblasSuccess; } clblasStatus checkMemObjects( cl_mem A, cl_mem B, cl_mem C, bool checkC, ErrorCodeSet errA, ErrorCodeSet errB, ErrorCodeSet errC ) { cl_mem_object_type mobjType = 0; if (!clGetMemObjectInfo(A, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) && (mobjType != CL_MEM_OBJECT_BUFFER)) { switch( errA ) { case A_MAT_ERRSET: return clblasInvalidMatA; case B_MAT_ERRSET: return clblasInvalidMatB; case C_MAT_ERRSET: return clblasInvalidMatC; case X_VEC_ERRSET: return clblasInvalidVecX; case Y_VEC_ERRSET: return clblasInvalidVecY; default: return clblasNotImplemented; } } mobjType = 0; if (!clGetMemObjectInfo(B, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) && (mobjType != CL_MEM_OBJECT_BUFFER)) { switch( errB ) { case A_MAT_ERRSET: return clblasInvalidMatA; case B_MAT_ERRSET: return clblasInvalidMatB; case C_MAT_ERRSET: return clblasInvalidMatC; case X_VEC_ERRSET: return clblasInvalidVecX; case Y_VEC_ERRSET: return clblasInvalidVecY; default: return clblasNotImplemented; } } mobjType = 0; if (checkC && !clGetMemObjectInfo(C, CL_MEM_TYPE, sizeof(mobjType), &mobjType, NULL) && (mobjType != CL_MEM_OBJECT_BUFFER)) { switch( errC ) { case A_MAT_ERRSET: return clblasInvalidMatA; case B_MAT_ERRSET: return clblasInvalidMatB; case C_MAT_ERRSET: return clblasInvalidMatC; case X_VEC_ERRSET: return clblasInvalidVecX; case Y_VEC_ERRSET: return clblasInvalidVecY; default: return clblasNotImplemented; } } return clblasSuccess; } clblas-2.10/src/library/blas/generic/common2.cc000066400000000000000000000064701264277366700214000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
 * ************************************************************************/

/*
 * Common C functions using C++ APIs
 */

#include #include #include

#define CAPS_DEBUG 0

extern "C" Kernel* makeKernelCached(cl_device_id device,
                                    cl_context context,
                                    solver_id_t sid,
                                    KernelKey * key,
                                    SolverKgen kernelGenerator,
                                    const SubproblemDim *dims,
                                    const PGranularity *pgran,
                                    const CLBLASKernExtra *extra,
                                    const char *buildOpts,
                                    cl_int *error)
{
    // For now, use the solver id to identify the kernel
    // FIXME: it would be better to use the FunctionID, but there seems to be
    // confusion between BlasFunctionID and BlasRoutineID in the makeSolutionSeq() function...
    char name[20];
    sprintf(name, "solver%d", (int)sid);

    BinaryLookup bl(context, device, name);

    // For now, use the whole extra field as the signature to identify the kernel binary;
    // may be improved...
    bl.variantInt(sid);
    bl.variantInt(key->nrDims);
    bl.variantRaw(key->subdims, sizeof(SubproblemDim) * key->nrDims);
    bl.variantRaw(extra, sizeof(CLBLASKernExtra));

    if (bl.found()) {
#if CAPS_DEBUG
        printf("Kernel loaded from cache\n");
#endif
        return makeKernel(device, context,
                          0, // generator is not needed because the program
                             // was loaded from the disk
                          bl.getProgram(), // pass the program loaded from the
                                           // disk
                          dims, pgran, extra, buildOpts, error);
    } else {
#if CAPS_DEBUG
        printf("Kernel generated from source\n");
#endif
        Kernel * kernel = makeKernel(device, context,
                                     kernelGenerator,
                                     0, // cl_program = 0 because it was not loaded from the disk
                                     dims, pgran, extra, buildOpts, error);
        bl.setProgram(kernel->program);
        bl.populateCache();
        return kernel;
    }
}
clblas-2.10/src/library/blas/generic/events.c000066400000000000000000000034751264277366700211670ustar00rootroot00000000000000/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#include #include #include #include

static const size_t ALLOCATION_STEP = 100;

static mutex_t *lock = NULL;
static cl_event *decomposeEvents = NULL;
static size_t numDecomposeEvents = 0;
static size_t maxDecomposeEvents = 0;

void decomposeEventsSetup(void)
{
    lock = mutexInit();
}

void decomposeEventsTeardown(void)
{
    mutexLock(lock);
    if (decomposeEvents != NULL) {
        free(decomposeEvents);
    }
    decomposeEvents = NULL;
    numDecomposeEvents = 0;
    maxDecomposeEvents = 0;
    mutexDestroy(lock);
    lock = NULL;
}

cl_event* decomposeEventsAlloc(void)
{
    cl_event* e;

    mutexLock(lock);
    if (numDecomposeEvents == maxDecomposeEvents) {
        e = realloc(decomposeEvents,
                    (maxDecomposeEvents + ALLOCATION_STEP) * sizeof(cl_event));
        if (e == NULL) {
            mutexUnlock(lock);
            return NULL;
        }
        decomposeEvents = e;
        maxDecomposeEvents += ALLOCATION_STEP;
    }
    e = &(decomposeEvents[numDecomposeEvents++]);
    mutexUnlock(lock);

    return e;
}
clblas-2.10/src/library/blas/generic/functor_cache.cc000066400000000000000000000051621264277366700226260ustar00rootroot00000000000000/* ************************************************************************
 * Copyright 2014 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#include #include #include #include #include <../functor/include/functor.h> #include #include #include

// ==================================================
// == clblasFunctorCacheBase
// ==================================================

typedef std::set<clblasFunctorCacheBase *> clblasFunctorCacheSet;

// Provides the set of all existing functor caches
//
// Remark: Since the set is typically populated by the constructors
//         of global objects, we use the "construct on first use"
//         idiom, to avoid the infamous "static initialization order fiasco".
//         See for example http://www.parashift.com/c++-faq/static-init-order.html
//
// Remark: The current implementation is not thread-safe but that should
//         be fine since the cache is supposed to be populated at startup
//         (assuming that all functor caches are global objects) and
//
static clblasFunctorCacheSet & getFunctorCacheSet()
{
    static clblasFunctorCacheSet * all = new clblasFunctorCacheSet;
    return *all;
}

//
// This function is supposed to be called from clblasTearDown to empty all caches
//
extern "C" void cleanFunctorCaches(void)
{
    // Ask each registered cache to clean itself.
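    /*
     * Hypothetical teardown path (an illustrative sketch only, not an extra API):
     * per the comment above, clblasTearDown is expected to end up here, roughly
     *
     *     clblasTearDown()             // library shutdown
     *         -> cleanFunctorCaches()  // walks the registry filled by the
     *                                  // clblasFunctorCacheBase constructor below
     *                                  // and asks every registered cache to discardAll()
     */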
clblasFunctorCacheSet & all = getFunctorCacheSet() ; for (clblasFunctorCacheSet::iterator it= all.begin(); it!=all.end(); ++it) { clblasFunctorCacheBase * cache = *it ; cache->discardAll() ; } } clblasFunctorCacheBase::clblasFunctorCacheBase() { // if ( _cleanFunctorCachesHook == 0 ) // _cleanFunctorCachesHook = cleanFunctorCaches ; // Install the hook to call cleanFunctorCaches clblasFunctorCacheSet & all = getFunctorCacheSet() ; all.insert(this) ; } clblasFunctorCacheBase::~clblasFunctorCacheBase() { clblasFunctorCacheSet & all = getFunctorCacheSet() ; all.erase(this) ; } clblas-2.10/src/library/blas/generic/kdump.c000066400000000000000000000114611264277366700207770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "solution_seq.h" #ifdef DUMP_CLBLAS_KERNELS enum { SRC_BUFSIZE = 512244 }; static void getFuncName(char *name, BlasFunctionID funcID, DataType dtype) { switch (funcID) { case CLBLAS_GEMV: strcpy(name + 1, "GEMV"); break; case CLBLAS_SYMV: strcpy(name + 1, "SYMV"); break; case CLBLAS_GEMM: strcpy(name + 1, "GEMM"); break; case CLBLAS_TRMM: strcpy(name + 1, "TRMM"); break; case CLBLAS_TRSM: strcpy(name + 1, "TRSM"); break; case CLBLAS_SYRK: strcpy(name + 1, "SYRK"); break; case CLBLAS_SYR2K: strcpy(name + 1, "SYR2K"); break; default: break; } if (dtype == TYPE_FLOAT) { name[0] = 's'; } else { name[0] = dtypeToPrefix(dtype); } } static void addTranspSuffix(char *buf, clblasTranspose flag) { const char *s; if (flag == clblasNoTrans) { return; } s = (clblasTrans) ? "t" : "tc"; strcat(buf, s); } static void fileNameFromSolution( char *name, BlasFunctionID funcID, const SolutionStep *step) { const char *s; const CLBlasKargs *kargs = (const CLBlasKargs*)&step->args; bool isTriangFn; isTriangFn = (funcID == CLBLAS_TRMM || funcID == CLBLAS_TRSM); strcpy(name, "./"); name += strlen(name); getFuncName(name, funcID, kargs->dtype); s = (kargs->order == clblasRowMajor) ? "_row_" : "_col_"; strcat(name, s); addTranspSuffix(name, kargs->transA); if (isTriangFn) { s = (kargs->uplo == clblasUpper) ? "_upper" : "_lower"; strcat(name, s); s = (kargs->side == clblasRight) ? 
"_right" : "_left"; strcat(name, s); } else { addTranspSuffix(name, kargs->transB); } name += strlen(name); sprintf(name, "_%lu_%lu", kargs->M, kargs->N); if (!isTriangFn) { name += strlen(name); sprintf(name, "_%lu", kargs->K); } strcat(name, ".kdump"); } void dumpKernel( const SolutionStep *step, CLBlasKernelType ktype) { FILE *file; char tmp[1024]; MemoryPattern *pattern; const char *s; const CLBlasKargs *kargs = (const CLBlasKargs*)&step->args; char *srcBuf; unsigned int i; fileNameFromSolution(tmp, step->funcID, step); file = fopen((const char*)tmp, "a+"); pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID]; // now, dump the info sprintf(tmp, "offset M = %lu, offset N = %lu, offset A = %lu," "offset BX = %lu, offset CY = %lu\n", kargs->offsetM, kargs->offsetN, kargs->offA, kargs->offBX, kargs->offCY); fputs(tmp, file); sprintf(tmp, "Memory pattern = %s\n", pattern->name); fputs(tmp, file); s = kernelTypeString(ktype); sprintf(tmp, "Kernel type = %s\n", s); fputs(tmp, file); // data parallelism granularity if (step->pgran.wgDim == 1) { sprintf(tmp, "work group size = %u\n", step->pgran.wgSize[0]); } else { sprintf(tmp, "work group size = %u x %u\n", step->pgran.wgSize[0], step->pgran.wgSize[1]); } fputs(tmp, file); fputs("Problem granulation\n", file); for (i = 0; i < pattern->nrLevels; i++) { sprintf(tmp, "[%u]: ", i); fputs(tmp, file); sprintfGranulation(tmp, step->subdims, i); fputs(tmp, file); fputs("\n", file); } srcBuf = malloc(SRC_BUFSIZE); if (srcBuf != NULL) { clGetProgramInfo(step->kernels[ktype]->program, CL_PROGRAM_SOURCE, SRC_BUFSIZE, srcBuf, NULL); fputs("Kernel source:\n\n", file); fputs(srcBuf, file); } else { fputs("Kernel source: not available\n", file); } free(srcBuf); fputs("--------------------------------------------------------------" "------------------------------------------------------------\n", file); fclose(file); } #endif /* DUMP_CLBLAS_KERNELS */ clblas-2.10/src/library/blas/generic/kernel_extra.c000066400000000000000000000016501264277366700223410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "kernel_extra.h" int clblasKernelExtraCmp(const void *extra, const void *extraKey) { return memcmp(extra, extraKey, sizeof(CLBLASKernExtra)); } clblas-2.10/src/library/blas/generic/matrix_dims.c000066400000000000000000000100371264277366700221750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include static __inline bool isRightSide(BlasFunctionID funcID, clblasSide side) { return ((funcID == CLBLAS_TRMM || funcID == CLBLAS_TRSM) && side == clblasRight); } void VISIBILITY_HIDDEN swapDimXY(SubproblemDim *dim) { size_t tmp; tmp = dim->itemX; dim->itemX = dim->itemY; dim->itemY = tmp; tmp = dim->x; dim->x = dim->y; dim->y = tmp; } size_t VISIBILITY_HIDDEN matrBlockPitch( const SubproblemDim *dim, MatrixRole mrole, DataType dtype, clblasSide side) { size_t tsize = dtypeSize(dtype); size_t nfloats = tsize / sizeof(cl_float); size_t rowLen = 0; switch (mrole) { case MATRIX_A: case MATRIX_B: rowLen = dim->bwidth; break; case MATRIX_C: rowLen = (side == clblasLeft) ? dim->x : dim->y; break; default: break; } rowLen = fl4RowWidth(rowLen, tsize) * FLOAT4_VECLEN / nfloats; return rowLen; } cl_ulong VISIBILITY_HIDDEN matrBlockSize( SubproblemDim *dim, MatrixRole mrole, DataType dtype, clblasSide side) { size_t height, pitch; pitch = matrBlockPitch(dim, mrole, dtype, side); height = matrBlockHeight(dim, mrole, side); return (cl_ulong)height * pitch; } size_t VISIBILITY_HIDDEN matrBlockHeight( SubproblemDim *dim, MatrixRole mrole, clblasSide side) { size_t ret = 0; switch (mrole) { case MATRIX_A: ret = dim->y; break; case MATRIX_B: ret = dim->x; break; case MATRIX_C: ret = (side == clblasLeft) ? dim->y : dim->x; break; default: break; } return ret; } void VISIBILITY_HIDDEN kargsToProbDims( SubproblemDim *probDim, BlasFunctionID funcID, const CLBlasKargs *kargs, bool offset) { if (funcID == CLBLAS_SYMV) { if (offset) { probDim->y = kargs->offsetN; probDim->x = 0; probDim->bwidth = 0; } else { probDim->y = kargs->N; probDim->x = kargs->N; probDim->bwidth = kargs->K; } } else { if (offset) { probDim->y = kargs->offsetM; probDim->x = kargs->offsetN; } else { probDim->y = kargs->M; probDim->x = kargs->N; } if (isRightSide(funcID, kargs->side)) { swapDimXY(probDim); } if (funcID == CLBLAS_GEMV) { if (kargs->transA != clblasNoTrans) { swapDimXY(probDim); } probDim->bwidth = (offset) ? 0 : probDim->x; } else { probDim->bwidth = (offset) ? 0 : kargs->K; } } } void VISIBILITY_HIDDEN probDimsToKargs( CLBlasKargs *kargs, BlasFunctionID funcID, SubproblemDim *probDim, bool offset) { size_t *m, *n; SubproblemDim tmpDim; if (offset) { m = &kargs->offsetM; n = &kargs->offsetN; } else { m = &kargs->M; n = &kargs->N; kargs->K = probDim->bwidth; } tmpDim = *probDim; if (isRightSide(funcID, kargs->side)) { swapDimXY(&tmpDim); } if (funcID == CLBLAS_GEMV) { if (kargs->transA != clblasNoTrans) { swapDimXY(&tmpDim); } } *m = tmpDim.y; *n = tmpDim.x; } clblas-2.10/src/library/blas/generic/matrix_props.c000066400000000000000000000100671264277366700224070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Implementation of functions for determining matrix properties */ #include "matrix_props.h" static bool gemmIsTrans(KernelExtraFlags flags, MatrixRole mrole) { bool trans = false; bool order = false; switch (mrole) { case MATRIX_A: trans = ((flags & KEXTRA_TRANS_A) != 0); order = ((flags & KEXTRA_COLUMN_MAJOR) != 0); break; case MATRIX_B: trans = ((flags & KEXTRA_TRANS_B) != 0); order = !(flags & KEXTRA_COLUMN_MAJOR); break; case MATRIX_C: trans = false; order = ((flags & KEXTRA_COLUMN_MAJOR) != 0); break; default: break; } // each initial flag "flip" resulting need transposing flag return (trans ^ order); } static bool trxmIsTrans(KernelExtraFlags flags, MatrixRole mrole) { bool trans = false; bool order = false; bool side = ((flags & KEXTRA_SIDE_RIGHT) != 0); bool ret; switch (mrole) { case MATRIX_A: trans = ((flags & KEXTRA_TRANS_A) != 0); order = ((flags & KEXTRA_COLUMN_MAJOR) != 0); break; case MATRIX_B: case MATRIX_C: order = !(flags & KEXTRA_COLUMN_MAJOR); // row major break; default: break; } // each initial flag "flip" resulting need transposing flag ret = trans ^ order ^ side; if (mrole == MATRIX_C) { /* * the output matrix always has inverted transposing flags against * matrix B */ ret = !ret; } return ret; } static bool syrkIsTrans(KernelExtraFlags flags, MatrixRole mrole) { bool ret = false; switch (mrole) { case MATRIX_A: case MATRIX_B: { bool trans = ((flags & KEXTRA_TRANS_A) != 0); bool order = ((flags & KEXTRA_COLUMN_MAJOR) != 0); ret = (trans && !order) || (!trans && order); break; } case MATRIX_C: ret = ((flags & KEXTRA_COLUMN_MAJOR) != 0); break; default: break; } return ret; } static bool l2IsTrans(KernelExtraFlags flags, MatrixRole mrole) { bool ret; if (mrole == MATRIX_A) { bool trans = ((flags & KEXTRA_TRANS_A) != 0); bool order = ((flags & KEXTRA_COLUMN_MAJOR) != 0); ret = (trans && !order) || (!trans && order); } else { ret = false; } return ret; } bool isMatrixConj(KernelExtraFlags flags, MatrixRole mrole) { bool ret = false; switch (mrole) { case MATRIX_A: ret = ((flags & KEXTRA_CONJUGATE_A) != 0); break; case MATRIX_B: ret = ((flags & KEXTRA_CONJUGATE_B) != 0); break; default: ret = false; break; } return ret; } bool isMatrixAccessColMaj( BlasFunctionID funcID, KernelExtraFlags flags, MatrixRole mrole) { bool ret = false; switch (funcID) { case CLBLAS_SYMM: case CLBLAS_GEMM: case CLBLAS_GEMM2: ret = gemmIsTrans(flags, mrole); break; case CLBLAS_TRMM: case CLBLAS_TRSM: ret = trxmIsTrans(flags, mrole); break; case CLBLAS_SYRK: case CLBLAS_SYR2K: ret = syrkIsTrans(flags, mrole); break; case CLBLAS_TRMV: case CLBLAS_TRSV: case CLBLAS_TRSV_GEMV: ret = true; break; case CLBLAS_GEMV: case CLBLAS_SYMV: ret = l2IsTrans(flags, mrole); default: break; } return ret; } clblas-2.10/src/library/blas/generic/problem_iter.c000066400000000000000000000063741264277366700223510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // Problem iterator to scatter solving, for passing over matrix A #include #include #include #include "matrix_dims.h" #include "problem_iter.h" void VISIBILITY_HIDDEN initProblemIterator( ProblemIterator *iter, BlasFunctionID funcID, MatrixRole mrole, CLBlasKargs *kargs, size_t maxPanels, size_t maxBlocks, SubproblemDim *topDim) { SubproblemDim tmp; iter->mrole = mrole; iter->funcID = funcID; kargsToProbDims(&tmp, funcID, kargs, false); iter->size = matrBlockHeight(&tmp, mrole, kargs->side); iter->globPitch = matrBlockPitch(&tmp, mrole, kargs->dtype, kargs->side); iter->maxPanels = maxPanels; iter->maxBlocks = maxBlocks; iter->uplo = kargs->uplo; iter->side = kargs->side; iter->dtype = kargs->dtype; iter->bpitch = matrBlockPitch(topDim, mrole, kargs->dtype, kargs->side); iter->bheight = matrBlockHeight(topDim, mrole, kargs->side); iteratorReset(iter); } void VISIBILITY_HIDDEN iteratorReset(ProblemIterator *iter) { if (isIterBackward(iter)) { iter->pos = iter->size; iter->prevPos = iter->size; } else { iter->pos = 0; iter->prevPos = 0; } } bool VISIBILITY_HIDDEN isIterBackward(ProblemIterator *iter) { bool ret = false; if (iter->funcID != CLBLAS_GEMM) { ret = (iter->side == clblasLeft && iter->uplo == clblasLower) || (iter->side == clblasRight && iter->uplo == clblasUpper); if (iter->funcID == CLBLAS_TRSM) { ret = !ret; } } return ret; } int VISIBILITY_HIDDEN iterateProblem(ProblemIterator *iter) { bool backward; size_t dy = 0; backward = isIterBackward(iter); if (((iter->funcID != CLBLAS_TRSM) && (!iter->maxPanels)) || ((iter->funcID == CLBLAS_TRSM) && (!iter->maxBlocks))) { iter->pos = (backward) ? 0 : iter->size; return 1; } iter->prevPos = iter->pos; if ((iter->funcID != CLBLAS_TRSM)) { dy = iter->maxPanels * iter->bheight; assert(dy != 0); } if (backward) { dy = szmin(iter->pos, dy); iter->pos -= dy; } else { dy = szmin(dy, iter->size - iter->pos); iter->pos += dy; } return (int)(backward && iter->pos == 0) || (!backward && iter->pos == iter->size); } size_t VISIBILITY_HIDDEN iterLastOffset(ProblemIterator *iter) { return (iter->pos > iter->prevPos) ? (iter->pos - iter->prevPos) : (iter->prevPos - iter->pos); } clblas-2.10/src/library/blas/generic/problem_iter.h000066400000000000000000000040601264277366700223440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef PROBLEM_ITERATOR_H_ #define PROBLEM_ITERATOR_H_ #include #include "clblas-internal.h" #include "blas_funcs.h" // Problem iterator to scatter solving, for passing over matrix A typedef struct ProblemIterator { MatrixRole mrole; size_t pos; size_t prevPos; size_t size; size_t globPitch; BlasFunctionID funcID; clblasUplo uplo; clblasSide side; DataType dtype; size_t maxPanels; size_t maxBlocks; size_t bpitch; size_t bheight; } ProblemIterator; /* * @maxBlocks: maximal number of blocks to iterate with; * There is as little as 1 iteration if it is * set to 0. */ void VISIBILITY_HIDDEN initProblemIterator( ProblemIterator *iter, BlasFunctionID funcID, MatrixRole mrole, CLBlasKargs *kargs, size_t maxPanels, size_t maxBlocks, SubproblemDim *topDim); void VISIBILITY_HIDDEN iteratorReset(ProblemIterator *iter); bool VISIBILITY_HIDDEN isIterBackward(ProblemIterator *iter); /* * Iterate in some dimension based on maximal blocks info; * Iteration for the 'SDIM_BWIDTH' component is prohibited. * Returns 1 when achieve the end position */ int VISIBILITY_HIDDEN iterateProblem(ProblemIterator *iter); size_t VISIBILITY_HIDDEN iterLastOffset(ProblemIterator *iter); #endif /* PROBLEM_ITERATOR_H_ */ clblas-2.10/src/library/blas/generic/solution_assert.c000066400000000000000000000130211264277366700231060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include "solution_assert.h" #define ASSERT_GREQ_AND_DIV(a, b) assert(((a) >= (b)) && ((a) % (b) == 0)) // solution area typedef struct SolArea { size_t offsetM; size_t M; size_t offsetN; size_t N; ListNode node; } SolArea; #ifdef ASSERT_GRANULATION // check the found dimensions are not wrong void VISIBILITY_HIDDEN assertGranulation( SubproblemDim *dims, unsigned int nrDims, PGranularity *pgran, unsigned int thLevel) { unsigned int i; size_t gsize; /* * subproblem dimensions on all levels must meet the following requirements: * * 1) Item work piece is greater then a processing step * 2) Item work piece is integrally divisible on the processing step * 3) Work pieces and processing steps don't grows at forwarding to the bottom level * 4) At passing to the thread level, the subproblem must be strict divisible among * all the threads */ gsize = pgran->wgSize[0] * pgran->wgSize[1]; for (i = 0; i < nrDims; i++) { if (i || dims[i].itemX != SUBDIM_UNUSED) { ASSERT_GREQ_AND_DIV(dims[i].itemX, dims[i].x); } if (i || dims[i].itemY != SUBDIM_UNUSED) { ASSERT_GREQ_AND_DIV(dims[i].itemY, dims[i].y); } if (i) { ASSERT_GREQ_AND_DIV(dims[i - 1].x, dims[i].itemX); ASSERT_GREQ_AND_DIV(dims[i - 1].y, dims[i].itemY); ASSERT_GREQ_AND_DIV(dims[i - 1].bwidth, dims[i].bwidth); } } assert((dims[thLevel].itemX * dims[thLevel].itemY) * gsize == dims[thLevel - 1].x * dims[thLevel - 1].y); } #endif // ASSERT_GRANULATION #ifdef ASSERT_IMAGE_STEPS static __inline void assertEnclosed(size_t off1, size_t size1, size_t off2, size_t size2) { bool enc = ((off1 >= off2) && (off1 < off2 + size2) && (off1 + size1 > off2) && (off1 + size1 <= off2 + size2)); assert(enc); } static __inline bool isIntersected(size_t off1, size_t size1, size_t off2, size_t size2) { return ((off1 >= off2 && off1 < off2 + size2) || (off1 + size1 > off2 && off1 + size1 <= off2 + size2)); } static void freeSolAreaNode(ListNode *node) { SolArea *area = container_of(node, node, SolArea); free(area); } static void accProcessed(ListNode *node, void *priv) { SolArea *a1 = container_of(node, node, SolArea); SolArea *a2 = (SolArea*)priv; if (!isIntersected(a1->offsetM, a1->M, a2->offsetM, a2->M)) { a2->M += a1->M; if (a2->offsetM > a1->offsetM) { a2->offsetM = a1->offsetM; } } if (!isIntersected(a1->offsetN, a1->N, a2->offsetN, a2->N)) { a2->N += a1->N; if (a2->offsetN > a1->offsetN) { a2->offsetN = a1->offsetN; } } } static int solAreaCmp(ListNode *a, const void *b) { SolArea *area = container_of(a, node, SolArea); const CLBlasKargs *kargs = (const CLBlasKargs*)b; int ret; ret = isIntersected(kargs->offsetM, kargs->M, area->offsetM, area->M); ret = ret && isIntersected(kargs->offsetN, kargs->N, area->offsetN, area->N); return !ret; } void VISIBILITY_HIDDEN assertImageSubstep( SolutionStep *wholeStep, SolutionStep *substep, ListHead *doneSubsteps) { CLBlasKargs *kargs1 = &substep->args; CLBlasKargs *kargs2 = &wholeStep->args; ListNode *node; SolArea *area; assertEnclosed(kargs1->offsetM, kargs1->M, kargs2->offsetM, kargs2->M); assertEnclosed(kargs1->offsetN, kargs1->N, kargs2->offsetN, kargs2->N); node = listNodeSearch(doneSubsteps, (const void*)&substep->args, solAreaCmp); assert(!node); area = malloc(sizeof(SolArea)); if (area == NULL) { fprintf(stderr, "[%s, line %d]: Failed to allocate memory for image " "step assertion!\n", __FILE__, __LINE__); } else { area->offsetM = substep->args.offsetM; area->M = substep->args.M; area->offsetN = 
substep->args.offsetN; area->N = substep->args.N; listAddToTail(doneSubsteps, &area->node); } } void VISIBILITY_HIDDEN assertImageStep(SolutionStep *wholeStep, ListHead *doneSubsteps) { SolArea area; area.offsetM = SIZE_MAX; area.M = 0; area.offsetN = SIZE_MAX; area.N = 0; listDoForEachPriv(doneSubsteps, accProcessed, &area); assert((area.offsetM == wholeStep->args.offsetM) && (area.M == wholeStep->args.M) && (area.offsetM ==wholeStep->args.offsetM) && (area.N == wholeStep->args.N)); } void VISIBILITY_HIDDEN releaseImageAssertion(ListHead *doneSubsteps) { listDoForEachSafe(doneSubsteps, freeSolAreaNode); listInitHead(doneSubsteps); } #endif /* ASSERT_IMAGE_STEPS */ clblas-2.10/src/library/blas/generic/solution_assert.h000066400000000000000000000031751264277366700231240ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SOLUTION_ASSERT_H_ #define SOLUTION_ASSERT_H_ #include "solution_seq.h" #ifdef ASSERT_GRANULATION void assertGranulation( SubproblemDim *dims, unsigned int nrDims, PGranularity *pgran, unsigned int thLevel); #else // ASSERT_GRANULATION // stub, do nothing #define assertGranulation(dims, nrDims, pgran, thLevel) #endif // !ASSERT_GRANULATION #ifdef ASSERT_IMAGE_STEPS void assertImageSubstep( SolutionStep *wholeStep, SolutionStep *substep, ListHead *doneSubsteps); void assertImageStep(SolutionStep *wholeStep, ListHead *doneSubsteps); void releaseImageAssertion(ListHead *doneSubsteps); #else /* ASSERT_IMAGE_STEPS */ // stubs #define assertImageSubstep(wholeStep, substep, doneSubsteps) #define assertImageStep(wholeStep, doneSubsteps) #define releaseImageAssertion(doneSubsteps) #endif /* !ASSERT_IMAGE_STEPS */ #endif /* SOLUTION_ASSERT_H_ */ clblas-2.10/src/library/blas/generic/solution_seq.c000066400000000000000000000326551264277366700224130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "matrix_dims.h" #include "problem_iter.h" #include "solution_assert.h" #include "solution_seq.h" bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole); void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step); static cl_int enqueueKernel( SolutionStep *step, const Kernel *kernel, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); static void splitSolutionStep( SolutionStep *rem, SolutionStep *cut, SDimComponent component, size_t chunk, bool backward); static cl_int executeImageStep( SolutionStep *step, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event); void freeSolutionSeq(ListHead *seq) { listDoForEachSafe(seq, freeSolutionStep); listInitHead(seq); } cl_int executeSolutionSeq(const ListHead *seq) { cl_int err = CL_SUCCESS; ListNode *i; SolutionStep *step; /* Enqueue computing kernels */ for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS); i = i->next) { step = container_of(i, node, SolutionStep); if (step->cmdQueue == NULL) { continue; } if (step->args.scimage[0]) { err = executeImageStep(step, step->numEventsInWaitList, step->eventWaitList, step->event); } else { #ifdef DEBUG_2 printf("enqueueKernel from executreSolutionSeq...\n"); #endif err = enqueueKernel(step, step->kernels[CLBLAS_COMPUTING_KERNEL], step->numEventsInWaitList, step->eventWaitList, step->event); } } return err; } /* private functions */ void VISIBILITY_HIDDEN freeSolutionStep(ListNode *node) { SolutionStep *step = container_of(node, node, SolutionStep); int i; for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; i++) { if (step->kernels[i] != NULL) { putKernel(clblasKernelCache, step->kernels[i]); } } releaseStepImgs(step); free(step); } static cl_int enqueueKernel( SolutionStep *step, const Kernel *kernel, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { cl_int err; KernelDesc kernelDesc; KernelErrorInfo errInfo; MemoryPattern *pattern; const CLBLASKernExtra *kextra = (const CLBLASKernExtra*)kernel->extra; SubproblemDim subdims[MAX_SUBDIMS]; step->args.kernType = kextra->kernType; pattern = &clblasSolvers[step->funcID].memPatterns[step->patternID]; kernelDesc.workDim = step->pgran.wgDim; memcpy(subdims, step->subdims, sizeof(step->subdims)); if(NULL==pattern->sops->calcThreads) { SubproblemDim globDim; const PGranularity *pgran; pgran = (pattern->nrLevels == 1) ? NULL : &step->pgran; kargsToProbDims(&globDim, step->funcID, &step->args, false); // fixup dimensions in respect with desired work dispatch order if ((step->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) { if (pattern->sops->innerDecompositionAxis(&step->args) == DECOMP_AXIS_X) { /* * these dimensions will not used more anywhere, so we can * just swap them */ swapDimXY(&subdims[0]); swapDimXY(&subdims[1]); swapDimXY(&globDim); } } calcGlobalThreads(kernelDesc.globalThreads, subdims, pgran, globDim.y, globDim.x); } else { #ifdef DEBUG_2 printf("calcThreads is defined\n"); #endif pattern->sops->calcThreads( kernelDesc.globalThreads, subdims, &step->pgran, &step->args, kextra); } // // Store the numWGSpawned for this kernel // This size can be used by sequence-steps down the line // e.g. 
Reduction of intermediate results of each work group // step->pgran.numWGSpawned[0] = kernelDesc.globalThreads[0] / step->pgran.wgSize[0]; step->pgran.numWGSpawned[1] = kernelDesc.globalThreads[1] / step->pgran.wgSize[1]; kernelDesc.localThreads[0] = step->pgran.wgSize[0]; kernelDesc.localThreads[1] = step->pgran.wgSize[1]; kernelDesc.workDim = step->pgran.wgDim; kernelDesc.waitListSize = numEventsInWaitList; kernelDesc.eventWaitList = eventWaitList; kernelDesc.nowait = 1; kernelDesc.event = event; kernelDesc.needExecTime = 0; memset(kernelDesc.args, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS); pattern->sops->assignKargs(kernelDesc.args, (const void*)&(step->args), kextra); errInfo.wrongArg = 0; errInfo.phase = 0; /* * TODO: log launchClKernel errors */ dumpKernel(step, kextra->kernType); err = clCreateKernelsInProgram(kernel->program, 1, &kernelDesc.kernel, NULL); if (err == CL_SUCCESS) { err = launchClKernel(&kernelDesc, step->cmdQueue, &errInfo); clReleaseKernel(kernelDesc.kernel); } return err; } bool VISIBILITY_HIDDEN isMatrixInImage( MemoryPattern *pattern, MatrixRole mrole) { const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra; bool ret = false; if (extra != NULL) { switch (mrole) { case MATRIX_A: ret = (extra->mobjA == CLMEM_IMAGE); break; case MATRIX_B: ret = (extra->mobjB == CLMEM_IMAGE); break; default: break; } } return ret; } void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step) { int i; cl_mem *imgs = step->args.scimage; cl_device_id devID = NULL;; for (i = 0; (i < 2) && (imgs[i] != NULL); i++) { if (devID == NULL) { getQueueDevice(step->cmdQueue, &devID); } putSCImage(devID, imgs[i]); imgs[i] = NULL; //to avoid double release } } static cl_int executeImageStep( SolutionStep *step, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { SolutionStep outerStep, innerStep, execStep; cl_int err = CL_SUCCESS; int currImg = 0; size_t imgWidth, imgHeight; size_t ha, hb; size_t maxPanels[MATRIX_ROLES_NUMBER], maxBlocks[MATRIX_ROLES_NUMBER]; size_t off; SubproblemDim wholeDim; MatrixRole mrole; CLBlasKargs *kargs = &step->args; cl_mem *imgs = kargs->scimage; MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID]; ProblemIterator innerIter, outerIter; int oend = 0, iend; SDimComponent comp[2]; bool backward; ListHead doneSteps; CLBlasKernelType ktype; kargsToProbDims(&wholeDim, step->funcID, kargs, false); memset(maxPanels, 0, sizeof(maxPanels)); memset(maxBlocks, 0, sizeof(maxPanels)); memcpy(&outerStep, step, sizeof(SolutionStep)); memcpy(&execStep, step, sizeof(SolutionStep)); listInitHead(&doneSteps); /* * Cover the whole problem with dimension which matrix blocks are * fitted to images at. 
*/ for (mrole = MATRIX_A; mrole < MATRIX_C; mrole++) { if (!isMatrixInImage(mempat, mrole)) { continue; } clGetImageInfo(imgs[currImg], CL_IMAGE_WIDTH, sizeof(imgWidth), &imgWidth, NULL); clGetImageInfo(imgs[currImg], CL_IMAGE_HEIGHT, sizeof(imgHeight), &imgHeight, NULL); if (step->funcID == CLBLAS_TRSM) { maxPanels[mrole] = 0; maxBlocks[mrole] = 0; } else { maxPanels[mrole] = imgHeight / matrBlockHeight(step->subdims, mrole, clblasLeft); } currImg++; } /* * for GEMM function we can take both the matrices as outer, it depends on * their sizes and image sizes */ if (step->funcID == CLBLAS_GEMM) { size_t dx, dy; // FIXME: check which of them use really an image ha = matrBlockHeight(&wholeDim, MATRIX_A, clblasLeft); hb = matrBlockHeight(&wholeDim, MATRIX_B, clblasLeft); dx = maxPanels[MATRIX_B] * matrBlockHeight(step->subdims, MATRIX_B, clblasLeft); dy = maxPanels[MATRIX_A] * matrBlockHeight(step->subdims, MATRIX_A, clblasLeft); // hb + (hb*ha)/dx < ha + (ha*hb)/dy if ((hb / ha) < (1 + hb / dy) / (1 + ha / dx)) { mrole = MATRIX_B; } else { mrole = MATRIX_A; } } else { mrole = MATRIX_B; } /* * Let's cover the whole image based step. * Pattern iterator is used for traversing */ initProblemIterator(&outerIter, step->funcID, mrole, kargs, maxPanels[mrole], maxBlocks[mrole], step->subdims); if (mrole == MATRIX_B) { comp[0] = SDIM_X; comp[1] = SDIM_Y; mrole = MATRIX_A; } else { comp[0] = SDIM_Y; comp[1] = SDIM_X; mrole = MATRIX_B; } initProblemIterator(&innerIter, step->funcID, mrole, kargs, maxPanels[mrole], maxBlocks[mrole], step->subdims); backward = isIterBackward(&innerIter); /* * Difference in overflowing checking in the outer and inner loops * is due to */ do { iteratorReset(&innerIter); iend = 0; oend = iterateProblem(&outerIter); off = iterLastOffset(&outerIter); splitSolutionStep(&outerStep, &execStep, comp[0], off, false); if (execStep.funcID == CLBLAS_GEMM) { fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0); } memcpy(&innerStep, &execStep, sizeof(SolutionStep)); ktype = (comp[0] == SDIM_Y) ? CLBLAS_PREP_A_KERNEL : CLBLAS_PREP_B_KERNEL; if (execStep.kernels[ktype] != NULL) { err = enqueueKernel(&execStep, execStep.kernels[ktype], numEventsInWaitList, eventWaitList, event); if (err != CL_SUCCESS) { break; } } do { iend = iterateProblem(&innerIter); off = iterLastOffset(&innerIter); splitSolutionStep(&innerStep, &execStep, comp[1], off, backward); if (execStep.funcID == CLBLAS_GEMM) { fixupGemmOffsets(&execStep.args, execStep.extraFlags, 0); } assertImageSubstep(step, &execStep, &doneSteps); ktype = (comp[1] == SDIM_Y) ? 
CLBLAS_PREP_A_KERNEL : CLBLAS_PREP_B_KERNEL; if (execStep.kernels[ktype] != NULL) { err = enqueueKernel(&execStep, execStep.kernels[ktype], numEventsInWaitList, eventWaitList, event); } if (err == CL_SUCCESS) { err = enqueueKernel(&execStep, execStep.kernels[CLBLAS_COMPUTING_KERNEL], numEventsInWaitList, eventWaitList, event); } } while (!iend && (err == CL_SUCCESS)); } while (!oend && (err == CL_SUCCESS)); if (err == CL_SUCCESS) { assertImageStep(step, &doneSteps); } releaseImageAssertion(&doneSteps); return err; } static void splitSolutionStep( SolutionStep *rem, SolutionStep *cut, SDimComponent component, size_t chunk, bool backward) { SubproblemDim remDim, cutDim; SubproblemDim remDimOff, cutDimOff; kargsToProbDims(&remDimOff, rem->funcID, &rem->args, true); kargsToProbDims(&remDim, rem->funcID, &rem->args, false); memcpy(&cutDim, &remDim, sizeof(SubproblemDim)); memcpy(&cutDimOff, &remDimOff, sizeof(SubproblemDim)); memcpy(cut, rem, sizeof(SolutionStep)); if (component == SDIM_Y) { if (backward) { cutDimOff.y += remDim.y - chunk; } else { remDimOff.y += chunk; } cutDim.y = chunk; remDim.y -= chunk; } else { if (backward) { cutDimOff.x += remDim.x - chunk; } else { remDimOff.x += chunk; } cutDim.x = chunk; remDim.x -= chunk; } probDimsToKargs(&rem->args, rem->funcID, &remDimOff, true); probDimsToKargs(&rem->args, rem->funcID, &remDim, false); probDimsToKargs(&cut->args, cut->funcID, &cutDimOff, true); probDimsToKargs(&cut->args, cut->funcID, &cutDim, false); } clblas-2.10/src/library/blas/generic/solution_seq_make.c000066400000000000000000002102471264277366700234030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include #include #include #include "matrix_dims.h" #include "solution_assert.h" #include "solution_seq.h" #define DECOMPOSITION_THRESHOLD(type) (2560 * sizeof(cl_float) / dtypeSize(type)) /* From solution_seq.c */ bool VISIBILITY_HIDDEN isMatrixInImage(MemoryPattern *pattern, MatrixRole mrole); void VISIBILITY_HIDDEN releaseStepImgs(SolutionStep *step); #define isMatrixCached(pattern, mrole) \ checkMatrixMemLevelSet(pattern, mrole, (CLMEM_LEVEL_L2 | CLMEM_LEVEL_L1)) #define isLdsUsed(pattern) \ (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) || \ checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS)) enum { DEFAULT_BUFS_LSIZE_0 = 8, DEFAULT_BUFS_LSIZE_1 = 8, DEFAULT_CACHED_BUFS_LSIZE_0 = 8, DEFAULT_CACHED_BUFS_LSIZE_1 = 8 }; static cl_uint getQueueMaxImages(cl_command_queue queue); static bool checkMatrixMemLevelSet(MemoryPattern *pattern, MatrixRole mrole, meml_set_t mask); static void stripeDivision(BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs); static void rectDivision(BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs); static void triMatrixStripeDivision(BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs); static cl_bool findBestPattern(SolutionStep *step); static void getDefaultStepGranulation(SolutionStep *step); static bool avoidLoadFromStorage(SolutionStep *step); static bool getStepResources(SolutionStep *step); static void getSuitableImageSizes(size_t *minWidth, size_t *minHeight, size_t *bestHeight, MatrixRole mrole, CLBlasKargs *kargs, unsigned int vecLen, SubproblemDim *subdims); static ListNode* decomposeTRXMStep(SolutionStep *step); static ListNode* decomposeSYRKStep(SolutionStep *step); static ListNode* decomposeSYR2KStep(SolutionStep *step); // Find vector length which lda and tile width is divisible on unsigned int appropriateVecLen(size_t ld, unsigned int tsize, size_t twidth, int funcLevel) { unsigned int vlen = sizeof(cl_float4) / tsize; if (funcLevel == 3) { vlen *= 2; } while (vlen > twidth) { vlen /= 2; } while ((ld % vlen) || (twidth % vlen)) { vlen /= 2; } return vlen; } /* * Select an appropriate vectorization to perform computation with. * It's done based upon the problem sizes and device type. The device type * is taken into account as well since not all devices allow not aligned * access to vector data. 
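 * For example, with single-precision data the alignment check uses a
 * float4-sized vector (four elements): a leading dimension or start offset
 * that is not a multiple of it raises the matching KEXTRA_NO_COPY_VEC_*
 * flag so that matrix is accessed with scalar loads. For patterns that do
 * not stage data through LDS, the per-matrix vector lengths come from
 * appropriateVecLen(); e.g. a level-3 routine on floats with lda = 1000 and
 * a tile width of 8 keeps a vector length of 8, while lda = 1002 drops it
 * to 2.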
*/ cl_int selectVectorization( const SolutionStep *step, CLBLASKernExtra *kextra) { const TargetDevice *device = &step->device; cl_device_type devType; cl_int err; size_t tw; bool tra; size_t checkedSizes[3]; int i, j; const CLBlasKargs *kargs = &step->args; KernelExtraFlags kflags = kextra->flags; KernelExtraFlags vecFlags[3] = { KEXTRA_NO_COPY_VEC_A, KEXTRA_NO_COPY_VEC_B, KEXTRA_NO_COPY_VEC_C }; unsigned int vlen; unsigned int tsize; MemoryPattern *mempat; const SubproblemDim *dim = &step->subdims[1]; int funcLevel; mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID]; err = clGetDeviceInfo(device->id, CL_DEVICE_TYPE, sizeof(devType), &devType, NULL); if (err != CL_SUCCESS) { return err; } if (isLdsUsed(mempat)) { kextra->vecLenC = kextra->vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype); kextra->vecLenA = kextra->vecLenB = kextra->vecLen; } else { kextra->vecLenA = kextra->vecLenB = 0; } // select vectorization based upon leading dimensions and starting offsets for (i = 0; i < 2; i++) { if (!i) { // check by leading dimensions checkedSizes[0] = kargs->lda.matrix; if (funcBlasLevel(step->funcID) == 2) { checkedSizes[1] = checkedSizes[2] = 0; } else { checkedSizes[1] = kargs->ldb.matrix; checkedSizes[2] = kargs->ldc.matrix; } } else { // check by offsets checkedSizes[0] = kargs->offA; checkedSizes[1] = kargs->offBX; checkedSizes[2] = kargs->offCY; } if (funcHasTriangMatrix(step->funcID)) { checkedSizes[2] = checkedSizes[1]; } vlen = sizeof(cl_float4) / dtypeSize(step->args.dtype); /* * Disable vectorization at load from the global memory to LDS * if matrix width is not aligned on the boundary of the float4 */ for (j = 0; j < 3; j++) { if (checkedSizes[j] % vlen) { kflags |= vecFlags[j]; } } if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV)) { if ( ( ((kflags & KEXTRA_UPPER_TRIANG)==0) && (kflags & KEXTRA_COLUMN_MAJOR) ) || ( ((kflags & KEXTRA_UPPER_TRIANG)) && ((kflags & KEXTRA_COLUMN_MAJOR) == 0)) ) { if( (kargs->N) % vlen) { kflags |= KEXTRA_NO_COPY_VEC_A; } } } if(mempat->sops->selectVectorization != NULL) { kflags |= mempat->sops->selectVectorization((void *)kargs, vlen); } if ((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV)) { // // TRTRI, GEMV Part - Only Scalar loads // PENDING: // Analyze Case by Case and selectively enable/disable // kflags |= KEXTRA_NO_COPY_VEC_A; kflags |= KEXTRA_NO_COPY_VEC_B; } // // Routines that Use LDS should be above this IF statement // if (isLdsUsed(mempat)) { continue; } // // Routines that dont use LDS have to be below the isLdsUsed() code // if (step->funcID == CLBLAS_GEMM2) { if ((step->subdims[0].y > step->args.M) || (step->subdims[0].x > step->args.N)) { kextra->vecLen = 1; } else { kextra->vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype); } kextra->vecLenA = kextra->vecLen; kextra->vecLenB = kextra->vecLen; kextra->vecLenC = kextra->vecLen; continue; } if (step->funcID == CLBLAS_GEMM_TAIL) { kextra->vecLen = 1; kextra->vecLenA = 1; kextra->vecLenB = 1; kextra->vecLenC = 1; continue; } funcLevel = funcBlasLevel(step->funcID); funcLevel = funcBlasLevel(step->funcID); /* * If the step's pattern uses LDS, it is responsible for alignment. * Otherwise it's needed to provide appropriate vector length */ tsize = dtypeSize(step->args.dtype); tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_A); tw = (tra) ? dim->y : dim->bwidth; vlen = appropriateVecLen(checkedSizes[0], tsize, tw, funcLevel); kextra->vecLenA = (kextra->vecLenA) ? 
umin(kextra->vecLenA, vlen) : vlen; tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_B); tw = ((funcLevel == 2) || !tra) ? dim->bwidth : dim->x; vlen = appropriateVecLen(checkedSizes[1], tsize, tw, funcLevel); kextra->vecLenB = (kextra->vecLenB) ? umin(kextra->vecLenB, vlen) : vlen; tra = isMatrixAccessColMaj(step->funcID, kflags, MATRIX_C ); tw = ((funcLevel == 2) || tra) ? dim->y : dim->x; vlen = appropriateVecLen( checkedSizes[2], tsize, tw, funcLevel ); kextra->vecLenC = kextra->vecLenC ? umin(vlen,kextra->vecLenC) : vlen; kextra->vecLen = umin(kextra->vecLenA, kextra->vecLenB); kextra->vecLen = umin(kextra->vecLenC, kextra->vecLen); } kextra->flags = kflags; return CL_SUCCESS; } /* * Replace 'offsetM' and 'offsetN' field with respective extra offset at * 'offA', 'offBX', 'offCY' and taking into accoutn offset along K */ void VISIBILITY_HIDDEN fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK) { if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) { kargs->offA += offsetK * kargs->lda.matrix + kargs->offsetM; } else { kargs->offA += kargs->offsetM * kargs->lda.matrix + offsetK; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) { kargs->offBX += offsetK * kargs->ldb.matrix + kargs->offsetN; } else { kargs->offBX += kargs->offsetN * kargs->ldb.matrix + offsetK; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_C)) { kargs->offCY += kargs->offsetN * kargs->ldc.matrix + kargs->offsetM; } else { kargs->offCY += kargs->offsetM * kargs->ldc.matrix + kargs->offsetN; } kargs->offsetM = kargs->offsetN = 0; } ListNode *decomposeProblemStep(SolutionStep *step) { ListNode *node; switch (step->funcID) { case CLBLAS_TRMM: case CLBLAS_TRSM: node = decomposeTRXMStep(step); break; case CLBLAS_SYRK: node = decomposeSYRKStep(step); break; case CLBLAS_SYR2K: node = decomposeSYR2KStep(step); break; default: node = &step->node; break; } return node; } cl_int makeSolutionSeq( BlasFunctionID funcID, const CLBlasKargs *args, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, ListHead *seq) { cl_int err; cl_uint j, totalCUs, numDevicesWithoutDoubles; bool hasDouble; SolutionStep *step; CLBLASKernExtra extra; ListNode *i; MemoryPattern *pattern; solver_id_t sid; KernelKey key; bool need[MAX_CLBLAS_KERNELS_PER_STEP] = {true}; CLBlasKernelType ktype; Kernel *kernel; bool loadData = false; unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP]; size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP]; char bopts[BUILD_OPTS_MAXLEN]; // Moving bopts up. 
See the comments before findKernel() int ik; // first subdimension index in the subproblem dims array int firstDimIdx; if ((numCommandQueues == 0) || (commandQueues == NULL)) { return CL_INVALID_VALUE; } memset(buffer, 0, sizeof(buffer)); listInitHead(seq); totalCUs = 0; numDevicesWithoutDoubles = 0; for (j = 0; j < numCommandQueues; j++) { cl_device_id devID; err = getQueueDevice(commandQueues[j], &devID); if (err != CL_SUCCESS) { continue; } if (isDoubleBasedType(args->dtype)) { hasDouble = deviceHasNativeDouble(devID, &err); if (err != CL_SUCCESS) { continue; } if (!hasDouble) { numDevicesWithoutDoubles++; continue; } } step = calloc(1, sizeof(SolutionStep)); if (step == NULL) { freeSolutionSeq(seq); return CL_OUT_OF_HOST_MEMORY; } step->funcID = funcID; step->args = *args; step->args.addrBits = deviceAddressBits(devID, &err); step->cmdQueue = commandQueues[j]; step->numEventsInWaitList = numEventsInWaitList; step->eventWaitList = eventWaitList; step->event = NULL; if (events != NULL) { step->event = events + j; } step->pgran.wfSize = deviceWavefront(devID, &err); step->extraFlags = clblasArgsToKextraFlags(args, step->funcID); if (step->funcID == CLBLAS_SYR2K) { step->extraFlags |= KEXTRA_SYRK_2K_RANK; } step->device.id = devID; err = identifyDevice(&step->device); if (err != CL_SUCCESS) { freeSolutionSeq(seq); return err; } totalCUs += deviceComputeUnits(devID, &err); listAddToTail(seq, &step->node); } if (totalCUs == 0) { return (numDevicesWithoutDoubles == numCommandQueues) ? CL_INVALID_DEVICE : CL_INVALID_COMMAND_QUEUE; } memset(&extra, 0, sizeof(extra)); memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char)); extra.dtype = args->dtype; /* Split task between multiple command queues */ if (funcID == CLBLAS_GEMM) { rectDivision(funcID, args, seq, totalCUs); } else if ((funcID == CLBLAS_SYRK) || (funcID == CLBLAS_SYR2K)) { triMatrixStripeDivision(funcID, args, seq, totalCUs); } else { stripeDivision(funcID, args, seq, totalCUs); } /* Some steps can be decomposed into several sequential substeps */ parseEnvImplementation(); // Function level decomposition for (i = listNodeFirst(seq); i != seq; i = i->next) { step = container_of(i, node, SolutionStep); if (step->cmdQueue == NULL) { continue; } if (step->funcID == CLBLAS_GEMM) { fixupGemmOffsets(&step->args, step->extraFlags, 0); continue; } i = decomposeProblemStep(step); } #ifdef DEBUG_2 printf("Finding a kernel for each step\n"); #endif /* Find a kernel for each step */ for (i = listNodeFirst(seq); (i != seq) && (err == CL_SUCCESS); i = i->next) { DeviceIdent *ident; step = container_of(i, node, SolutionStep); if (step->cmdQueue == NULL) { continue; } ident = &step->device.ident; /* * Set vendor dependent flags * * FIXME: thrown this kludge away when generator interface will * support passing ident info */ if (ident->vendor == VENDOR_AMD) { step->extraFlags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD); } if (!findBestPattern(step)) { err = CL_OUT_OF_RESOURCES; break; } #ifdef DEBUG_2 printf("Find best pattern finished\n"); #endif pattern = &(clblasSolvers[step->funcID].memPatterns[step->patternID]); firstDimIdx = 2 - pattern->nrLevels; sid = makeSolverID(step->funcID, step->patternID); err = getQueueDevice(step->cmdQueue, &key.device); err = getQueueContext(step->cmdQueue, &key.context); detectProblemTails(step); extra.flags = step->extraFlags; if (pattern->sops->fixupArgs) { pattern->sops->fixupArgs(&step->args, &step->subdims[firstDimIdx], &extra); } step->extraFlags = extra.flags; key.nrDims = pattern->nrLevels; memset(key.subdims, 0, 
sizeof(key.subdims)); memcpy(key.subdims, &step->subdims[firstDimIdx], sizeof(SubproblemDim) * key.nrDims); detectOffsets(step); extra.flags = step->extraFlags; need[CLBLAS_PREP_A_KERNEL] = isMatrixInImage(pattern, MATRIX_A); need[CLBLAS_PREP_B_KERNEL] = isMatrixInImage(pattern, MATRIX_B); /* * Now, find and enqueue each kernel. Generate and build the kernel * on the fly if this kernel is not presented neither in the cache * no in the storage */ for (ktype = CLBLAS_COMPUTING_KERNEL; ktype < MAX_CLBLAS_KERNELS_PER_STEP; ktype++) { SubproblemDim prepDims[2]; if (!need[ktype]) { continue; } extra.kernType = ktype; err = selectVectorization(step, &extra); if (err != CL_SUCCESS) { break; } kernel = NULL; // // Now that the build options is a part of EXTRA structure, // it is also a part of the kernelKey // Setting of build options need to be done before // findKernel() // memset(bopts, 0, BUILD_OPTS_MAXLEN*sizeof(char)); setupBuildOpts(bopts, key.device, pattern); if (pattern->sops->setBuildOptions) { pattern->sops->setBuildOptions(bopts, (void*)(step)); } memcpy(extra.buildOptions, bopts, BUILD_OPTS_MAXLEN); if (areKernelsCacheable()) { kernel = findKernel(clblasKernelCache, sid, &key, &extra); } if (kernel == NULL) { if (!loadData && !avoidLoadFromStorage(step)) { size_t MNK = (step->args.M + step->args.N + step->args.K) / 3; loadData = !getKernelInfo(&step->device, pattern->name, extra.dtype, step->extraFlags, (int)MNK, &buffer[0], &sizeBuffer[0]); } if (buffer[ktype] != NULL){ kernel = loadKernel((const unsigned char**)&buffer[ktype], sizeBuffer[ktype], &key, &extra, &err); } else { SubproblemDim *dims; dims = (ktype == CLBLAS_COMPUTING_KERNEL) ? step->subdims : prepDims; #ifdef DEBUG_2 printf("Build options used : %s\n", bopts); #endif kernel = makeKernelCached(key.device, key.context, sid, &key, pattern->sops->genKernel, &dims[firstDimIdx], &step->pgran, &extra, bopts, &err); } if (kernel == NULL) { break; } if (areKernelsCacheable()) { getKernel(kernel); if (addKernelToCache(clblasKernelCache, sid, kernel, &key, clblasKernelExtraCmp)) { putKernel(clblasKernelCache, kernel); } } } else { #ifdef DEBUG_CONTEXT printf("KERNEL FOUND IN CACHE\n"); #endif } step->kernels[ktype] = kernel; } } if (err != CL_SUCCESS) { freeSolutionSeq(seq); } // free binary kernels for (ik = 0; ik < MAX_CLBLAS_KERNELS_PER_STEP; ++ik) { free(buffer[ik]); } return err; } static cl_uint getQueueMaxImages(cl_command_queue queue) { cl_int err; cl_device_id device; cl_command_queue_properties props; cl_bool imageSupport; imageSupport = CL_FALSE; err = getQueueDevice(queue, &device); if (err != CL_SUCCESS) { return 0; } err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(imageSupport), &imageSupport, NULL); if (!imageSupport) { return 0; } props = 0; err = getQueueProperties(queue, &props); if (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) { return 0; } return 2; } static bool isTransBUsed(BlasFunctionID funcID) { if ((CLBLAS_GEMM == funcID) || (CLBLAS_GEMM2 == funcID) || (CLBLAS_GEMM_TAIL == funcID)) { return true; } else { return false; } } KernelExtraFlags clblasArgsToKextraFlags(const CLBlasKargs *args, BlasFunctionID funcID) { KernelExtraFlags flags = KEXTRA_NO_FLAGS; if (args->transA != clblasNoTrans) { flags |= KEXTRA_TRANS_A; } if (isTransBUsed(funcID) && args->transB != clblasNoTrans) { flags |= KEXTRA_TRANS_B; } if (isComplexType(args->dtype)) { if (args->transA == clblasConjTrans) { flags |= KEXTRA_CONJUGATE_A; } if (isTransBUsed(funcID) && args->transB == clblasConjTrans) { flags |= 
KEXTRA_CONJUGATE_B; } } if (args->order == clblasColumnMajor) { flags |= KEXTRA_COLUMN_MAJOR; } if ((funcID != CLBLAS_TRMM) && (funcID != CLBLAS_TRSM)) { // check if beta is zero ArgMultiplier z; memset(&z, 0, sizeof(z)); if (!memcmp(&args->beta, &z, sizeof(z))) { flags |= KEXTRA_BETA_ZERO; } } if (funcID != CLBLAS_GEMM) { if (args->uplo == clblasUpper) { flags |= KEXTRA_UPPER_TRIANG; } if (args->side == clblasRight) { flags |= KEXTRA_SIDE_RIGHT; } if (args->diag == clblasUnit) { flags |= KEXTRA_UNIT_DIAGONAL; } } if (funcID == CLBLAS_GEMV || funcID == CLBLAS_SYMV) { if (args->ldb.vector == 1) { flags |= KEXTRA_INCX_ONE; } if (args->ldc.vector == 1) { flags |= KEXTRA_INCY_ONE; } } return flags; } static bool checkMatrixMemLevelSet( MemoryPattern *pattern, MatrixRole mrole, meml_set_t mask) { const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra; meml_set_t mset; if (mrole == MATRIX_C || extra == NULL) { return false; } switch (mrole) { case MATRIX_A: mset = extra->aMset; break; case MATRIX_B: mset = extra->bMset; break; default: break; } return ((mset & mask) != 0); } /* Next three functions: stripeDivision(), rectDivision() and * triMatrixStripeDivision(), split output matrix into set of non-intersected * rectangles. Area of each rectangle depends on the number of Compute Units, * available on a device of the given queue. * Division is also aligned on the DIVISION_ALIGNMENT boundary. It is measured * in number of elements. */ /* This constant is used in: * - stripeDivision() * - rectDivision() * - triMatrixStripeDivision() * - decomposeTRXMStep() */ static const size_t DIVISION_ALIGNMENT = 128; static size_t align( size_t value, size_t alignment) { /* This implementation assumes that alignment is the power of 2. */ return (value + (alignment >> 1)) & (~(alignment - 1)); } /* Stripe division is done according to the picture: * * +------+--+----+--+ * | | | | | * | | | | | * | 1 | 2| 3 | 4| * | | | | | * | | | | | * +------+--+----+--+ */ static void stripeDivision( BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs) { SolutionStep *step; ListNode *i; cl_int err; cl_device_id device; cl_uint nrCU; SubproblemDim size, offset, stepSize; bool first = true; kargsToProbDims(&offset, funcID, args, true); kargsToProbDims(&size, funcID, args, false); for (i = listNodeFirst(seq); i != seq; i = i->next) { step = container_of(i, node, SolutionStep); err = getQueueDevice(step->cmdQueue, &device); nrCU = deviceComputeUnits(device, &err); if (totalCUs == 0) { step->cmdQueue = NULL; continue; } stepSize = size; if (!first) { probDimsToKargs(&(step->args), funcID, &offset, true); } if (funcID == CLBLAS_GEMV) { if (totalCUs != nrCU) { stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5); stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT); if (stepSize.y == 0) { step->cmdQueue = NULL; } else if (stepSize.y > size.y) { stepSize.y = size.y; totalCUs = nrCU; } } offset.y += stepSize.y; size.y -= stepSize.y; } else { if (totalCUs != nrCU) { stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5); stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT); if (stepSize.x == 0) { step->cmdQueue = NULL; } else if (stepSize.x > size.x) { stepSize.x = size.x; totalCUs = nrCU; } } offset.x += stepSize.x; size.x -= stepSize.x; } totalCUs -= nrCU; probDimsToKargs(&(step->args), funcID, &stepSize, false); first = false; } } /* Rectangular division is done according to the picture: * * +------+-----+ * | | 2 | * | | | * | 1 +--+--+ * | |3 | 4| * | | | | * +------+--+--+ 
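 * Each rectangle is sized so that its area is roughly proportional to the
 * number of compute units behind the queue it is assigned to, aligned to
 * DIVISION_ALIGNMENT elements.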
* * The longest side is divided first. */ static void rectDivision( BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs) { SolutionStep *step, **sortedSteps; ListNode *i, *j; cl_int err; cl_device_id device; cl_uint nrCU, k, l; SubproblemDim size, offset, stepSize; unsigned int nrSteps = 0; /* 1. Sort steps according to the number of CU they have */ /* NOTE: We expect small number of steps, so simple insertion sort * would be enough. */ sortedSteps = calloc(listLength(seq), sizeof(*sortedSteps)); // assert(sortedSteps != NULL); k = 0; for (i = listNodeFirst(seq); i != seq; i = i->next, nrSteps++) { step = container_of(i, node, SolutionStep); err = getQueueDevice(step->cmdQueue, &device); sortedSteps[k] = step; nrCU = deviceComputeUnits(device, &err); for (j = i->next; j != seq; j = j->next) { step = container_of(i, node, SolutionStep); err = getQueueDevice(step->cmdQueue, &device); if (nrCU < deviceComputeUnits(device, &err)) { sortedSteps[k] = step; nrCU = deviceComputeUnits(device, &err); } } k++; } /* 2. Calculate rectangle sizes */ kargsToProbDims(&offset, funcID, args, true); kargsToProbDims(&size, funcID, args, false); stepSize = size; for (l = 0; l < k; l++) { step = sortedSteps[l]; err = getQueueDevice(step->cmdQueue, &device); nrCU = deviceComputeUnits(device, &err); if (totalCUs == 0) { step->cmdQueue = NULL; continue; } stepSize = size; if (l) { probDimsToKargs(&(step->args), funcID, &offset, true); } if (size.y > size.x) { if (totalCUs != nrCU) { stepSize.y = (size_t)(size.y * (double)nrCU / totalCUs + 0.5); stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT); if (stepSize.y > size.y) { stepSize.y = size.y; totalCUs = nrCU; } else if (stepSize.y == 0) { step->cmdQueue = NULL; } } size.y -= stepSize.y; offset.y += stepSize.y; } else { if (totalCUs != nrCU) { stepSize.x = (size_t)(size.x * (double)nrCU / totalCUs + 0.5); stepSize.x = align(stepSize.x, DIVISION_ALIGNMENT); if (stepSize.x > size.x) { stepSize.x = size.x; totalCUs = nrCU; } else if (stepSize.x == 0) { step->cmdQueue = NULL; } } size.x -= stepSize.x; offset.x += stepSize.x; } probDimsToKargs(&(step->args), funcID, &stepSize, false); #ifdef DEBUG_2 printf("RectDivision:\n"); printf("\t offM=%d, offN=%d, M=%d, N=%d\n", step->args.offsetM, step->args.offsetN, step->args.M, step->args.N); #endif totalCUs -= nrCU; } free(sortedSteps); } /* Dividing triangular matrix (N x N) horizontally: * * +----+ * |\ | * +-\--+ * | \ | * | \| * +----+ * * Take into consideration the areas of triangles/trapezoids rather than * areas of stripes. 
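 * The stripe height assigned to a queue is therefore the root of a
 * quadratic equation, chosen so that the area of its trapezoid (rather than
 * merely its height) matches that queue's share of the compute units.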
*/ static void triMatrixStripeDivision( BlasFunctionID funcID, const CLBlasKargs *args, ListHead *seq, cl_uint totalCUs) { SolutionStep *step; ListNode *i; cl_int err; cl_device_id device; cl_uint nrCU; SubproblemDim size, offset, stepSize, stepOffset; size_t top; kargsToProbDims(&offset, funcID, args, true); kargsToProbDims(&size, funcID, args, false); top = 0; if (args->uplo == clblasUpper) { offset.y += size.y; } stepSize = size; for (i = listNodeFirst(seq); i != seq; i = i->next) { step = container_of(i, node, SolutionStep); err = getQueueDevice(step->cmdQueue, &device); nrCU = deviceComputeUnits(device, &err); if (totalCUs == 0) { step->cmdQueue = NULL; continue; } if (args->uplo == clblasLower) { stepOffset = offset; } if (totalCUs != nrCU) { stepSize.y = (size_t)( sqrt(top * top + (double)nrCU / totalCUs * size.y * (top + size.x)) - top); stepSize.y = align(stepSize.y, DIVISION_ALIGNMENT); if ((stepSize.y == 0) || (stepSize.y > size.y)) { stepSize.y = size.y; totalCUs = nrCU; } else if (stepSize.y == 0) { step->cmdQueue = NULL; } /* We have to add special check because the direction of * splitting is 'bottom -> top' for UPLO = clblasUpper. */ else if (offset.y != align(offset.y, DIVISION_ALIGNMENT)) { size_t o = align(offset.y - stepSize.y, DIVISION_ALIGNMENT); if (o > offset.y) { o -= 2 * DIVISION_ALIGNMENT; } stepSize.y = offset.y - o; } } else { stepSize.y = size.y; } size.y -= stepSize.y; top += stepSize.y; if (args->uplo == clblasLower) { offset.y += stepSize.y; } else { offset.y -= stepSize.y; stepOffset = offset; } probDimsToKargs(&(step->args), funcID, &stepOffset, true); probDimsToKargs(&(step->args), funcID, &stepSize, false); totalCUs -= nrCU; } } static cl_bool findBestPattern(SolutionStep *step) { cl_uint maxImages; maxImages = getQueueMaxImages(step->cmdQueue); do { /* It may be non first attempt. Ensure that there are not * hold images for this step */ releaseStepImgs(step); step->patternID = selectPattern( step, maxImages ); assert(step->patternID != (unsigned int)-1); #ifdef DEBUG_2 printf("select Pattern Done\n"); #endif getStepGranulation(step); #ifdef DEBUG_2 printf("getStepGranulation done \n"); #endif assertGranulation(step->subdims, mempat->nrLevels, &step->pgran, mempat->thLevel); if (getStepResources(step)) break; } while (maxImages-- != 0); return (maxImages != (cl_uint)-1) ? CL_TRUE : CL_FALSE; } void detectProblemTails(SolutionStep *step) { SubproblemDim globDim, offDim; SubproblemDim *subdim; KernelExtraFlags kflags = KEXTRA_NO_FLAGS; subdim = step->subdims; kargsToProbDims(&globDim, step->funcID, &step->args, false); kargsToProbDims(&offDim, step->funcID, &step->args, true); #ifdef DEBUG_2 printf("detectProblemTails: subdimy=%d, subdimx=%d, subdimBwidth=%d\n", subdim->y, subdim->x, subdim->bwidth); #endif if (globDim.y % subdim->y) { kflags |= KEXTRA_TAILS_M; } if (globDim.x % subdim->x) { kflags |= KEXTRA_TAILS_N; } if (globDim.bwidth % subdim->bwidth) { kflags |= KEXTRA_TAILS_K; } if (clblasSolvers[step->funcID].memPatterns[step->patternID].nrLevels > 1) { if (globDim.y % subdim[1].y) { kflags |= KEXTRA_TAILS_M_LOWER; } if (globDim.x % subdim[1].x) { kflags |= KEXTRA_TAILS_N_LOWER; } if (globDim.bwidth % subdim[1].bwidth) { kflags |= KEXTRA_TAILS_K_LOWER; } } else { kflags |= (kflags & KEXTRA_TAILS_M) != 0 ? KEXTRA_TAILS_M_LOWER : 0; kflags |= (kflags & KEXTRA_TAILS_N) != 0 ? KEXTRA_TAILS_N_LOWER : 0; kflags |= (kflags & KEXTRA_TAILS_K) != 0 ? 
KEXTRA_TAILS_K_LOWER : 0; } // clean tails flags step->extraFlags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K | KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER | KEXTRA_TAILS_K_LOWER); // set tails flags step->extraFlags |= kflags; } void detectOffsets(SolutionStep *step) { const CLBlasKargs *args = &(step->args); KernelExtraFlags kflags = step->extraFlags; if (args->offsetM) { kflags |= KEXTRA_STARTM_NOT_ZERO; } if (args->offsetN) { kflags |= KEXTRA_STARTN_NOT_ZERO; } if (args->offA) { kflags |= KEXTRA_A_OFF_NOT_ZERO; } if (args->offBX) { kflags |= KEXTRA_BX_OFF_NOT_ZERO; } if (args->offCY) { kflags |= KEXTRA_CY_OFF_NOT_ZERO; } step->extraFlags = kflags; } //----------------------------------------------------------------------------- static unsigned int legacySelectPattern( BlasFunctionID funcID, unsigned int maxImages) { unsigned int id, i, n; MatrixRole mrole; MemoryPattern *pat; int score, maxScore = -1; id = -1; /* * Lookup all patterns, and assign a score per each matrix for * each pattern: * 0 - matrix is not cached * 2 - matrix is cached and stored in an image * 3 - matrix is cached and not stored in an image * * Find the pattern with the best score */ pat = clblasSolvers[funcID].memPatterns; for (i = 0; i < clblasSolvers[funcID].nrPatterns; i++, pat++) { score = 0; n = 0; for (mrole = MATRIX_A; mrole <= MATRIX_B; mrole++) { if (isMatrixCached(pat, mrole)) { if (isMatrixInImage(pat, mrole)) { n++; score += 2; } else { score += 3; } } } if (n > maxImages) { continue; } if (score > maxScore) { maxScore = score; id = i; } } return id; } //----------------------------------------------------------------------------- unsigned int selectPattern( SolutionStep* pStep, unsigned int maxImages ) { unsigned int i = 0; int selPatt = -1; int perf = -1; int maxPerf = -1; int funcID = pStep->funcID; unsigned int kflags = pStep->extraFlags; if (clblasSolvers[funcID].defaultPattern != -1) { // assert(clblasSolvers[funcID].defaultPattern < clblasSolvers[funcID].nrPatterns); return clblasSolvers[funcID].defaultPattern; } // select best-performing pattern for current case for( i = 0; i < clblasSolvers[funcID].nrPatterns; i++ ){ if( NULL != clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf ){ perf = clblasSolvers[funcID].memPatterns[i].sops->getPatternPerf( kflags, (void*)&pStep->args); if( perf > maxPerf ){ selPatt = i; maxPerf = perf; } } // if not all patterns provide performace estimation functions // use legacy pattern selection else{ return legacySelectPattern( funcID, maxImages ); } } return selPatt; } //----------------------------------------------------------------------------- /* * Check if tile sizes exceed the entire problem and adjust them * accordingly if yes */ bool dimensionsExceedProblemSize(SolutionStep *step) { SubproblemDim probDim; SubproblemDim *dims = step->subdims; BlasFunctionID funcID = step->funcID; MemoryPattern *mempat = &clblasSolvers[funcID].memPatterns[step->patternID]; /* * Looks like kernels of other functions handle the case themselves * and don't expect that everyone can adjust chosen decomposition */ if (!( (funcID == CLBLAS_GEMV) || (funcID == CLBLAS_SYMV) || (funcID == CLBLAS_GEMM) || (funcID == CLBLAS_TRMM) || (funcID == CLBLAS_TRSM) || (funcID == CLBLAS_SYRK) || (funcID == CLBLAS_SYR2K)) ) { return false; } kargsToProbDims(&probDim, step->funcID, &step->args, false); if (mempat->nrLevels != 2) { return false; } dims = &dims[1]; if (dims->x > probDim.x || dims->y > probDim.y || dims->bwidth > probDim.bwidth) { return true; } return false; } void 
getMinimalStepGranulation(SolutionStep *step) { SubproblemDim *decompDims = NULL; SubproblemDim probDims[2]; size_t factor = 0; // EINVAL if( NULL == step ){ return; } if (step->funcID == CLBLAS_GEMM2) { return; } kargsToProbDims( probDims, step->funcID, &step->args, false); decompDims = step->subdims; // All exceeding dimensions are set to 1 if ( decompDims[1].itemX > probDims->x ) { factor = decompDims[1].itemX; decompDims[1].itemX = 1; decompDims[1].x /= factor; decompDims[0].itemX /= factor; decompDims[0].x /= factor; } if ( decompDims[1].itemY > probDims->y ) { factor = decompDims[1].itemY; decompDims[1].itemY = 1; decompDims[1].y /= factor; decompDims[0].itemY /= factor; decompDims[0].y /= factor; } if( decompDims[1].bwidth > probDims->bwidth ){ decompDims[0].bwidth /= decompDims[1].bwidth; decompDims[1].bwidth = 1; } } void getStepGranulation(SolutionStep *step) { SubproblemDim *dims = step->subdims; cl_device_id devID; double time; int status = GF_ERROR; size_t MNK; #ifdef DEBUG_2 printf("getStepGranulation called........\n"); #endif MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID]; #ifdef DEBUG_2 printf("Got mempat structure.........0x%p\n", mempat); #endif #ifdef DEBUG_2 if ( mempat == NULL) { printf("mempat pointer is NULL...\n"); } else { printf("mempat pointer is non-null..\n"); if (mempat->sops == NULL) printf("sops is NULL\n"); else if (mempat->sops->getFlags == NULL) printf("getFlags() is NULL\n"); fflush(stdout); } #endif getQueueDevice(step->cmdQueue, &devID); #ifdef DEBUG_2 printf("QueueDevice done...\n"); #endif // try to load decomposition info from the storage /* * FIXME: It's a workaround so that to avoid getting some decomposition * sizes leading to strange hang ups */ if (!avoidLoadFromStorage(step)) { #ifdef DEBUG_2 printf("!avoidLoadFromStorage...Inside if\n"); #endif MNK = (step->args.M + step->args.N + step->args.K)/3; if (mempat->sops->innerDecompositionAxis) { size_t ld; // bas - banks aligned size, in bytes, should be // number of channels * bytes per channel // here it is set to 8*256 = 2048 = 512 floats size_t bas = 8*256; if (mempat->sops->innerDecompositionAxis(&step->args) == DECOMP_AXIS_X) { ld = step->args.ldb.matrix; } else { ld = step->args.lda.matrix; } if ((ld * dtypeSize(step->args.dtype)) % bas == 0) { //special bad case MNK = 0; } } if( step->funcID != CLBLAS_GEMM2 ) { status = getGranularityInfo(&step->device, mempat->name, step->args.dtype, step->extraFlags, (int)MNK, dims, &step->pgran, &time); } /* * Disable blocking for implementations dealing with cache reads * from the global memory */ //if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) { // dims[0].bwidth = dims[1].bwidth; //} } #ifdef DEBUG_2 printf("isLoadFromStorage done..\n"); #endif //Query solver for default granulation if (status == GF_ERROR) { // temporary mock, untill all solvers will return required default problem granulation // TODO: deprecate the getDefaultStepGranulation(step) function if(NULL==mempat->sops->getDefaultDecomp) { getDefaultStepGranulation(step); } else { mempat->sops->getDefaultDecomp( &step->pgran, step->subdims, MAX_SUBDIMS, (void*)&step->args); } } if (dimensionsExceedProblemSize(step)) { getMinimalStepGranulation(step); } } void getDefaultStepGranulation(SolutionStep *step) { unsigned int nrFloats; MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID]; SubproblemDim *dims = step->subdims; cl_ulong ldsSize; size_t wgX, wgY; bool square; SDimComponent component = SDIM_BWIDTH; 
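    /*
     * 'component' selects which dimension of subdims[0] is halved next when
     * the chosen tile does not fit into half of the local memory; the shrink
     * loop further below rotates it between bwidth, y and x.
     */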
DataType dtype = step->args.dtype; size_t tsize = dtypeSize(dtype); unsigned int i; SolverFlags sflags; unsigned int bcoeff; bool bothCached, fixedBw = false; cl_device_id devID; PGranularity *pgran = &step->pgran; size_t maxWorkGroupSize; int vecLen; size_t subdimyFactor = 1; size_t subdimxFactor = 1; #ifdef DEBUG_2 printf("getDefaultStepGranualtion called...\n"); #endif nrFloats = (unsigned int)(dtypeSize(dtype) / sizeof(cl_float)); square = ((mempat->sops->getFlags() & SF_TOP_INPUT_SQUARE_BLOCKS) != 0); bothCached = isMatrixCached(mempat, MATRIX_A) && isMatrixCached(mempat, MATRIX_B); if (step->cmdQueue != NULL) { getQueueDevice(step->cmdQueue, &devID); } else { devID = step->device.id; } clGetDeviceInfo(devID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(ldsSize), &ldsSize, NULL); clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); /* * Setup dimensions allowing to use more or less effectively the local * memory or cache; */ if (square) { dims[0].x = (dtype == TYPE_COMPLEX_DOUBLE) ? 16 : 32; /* * FIXME: for now, we restrict ourselves with square blocks due * to compilation issues */ dims[0].y = dims[0].x; //(dtype == TYPE_FLOAT) ? 32 : 16 dims[0].bwidth = dims[0].y; bcoeff = nrFloats; wgY = DEFAULT_BUFS_LSIZE_0; wgX = DEFAULT_BUFS_LSIZE_1; } else { bcoeff = (dtype == TYPE_COMPLEX_DOUBLE) ? 2 : 1; if (bothCached) { wgY = DEFAULT_CACHED_BUFS_LSIZE_0; wgX = DEFAULT_CACHED_BUFS_LSIZE_1; } else { wgY = DEFAULT_BUFS_LSIZE_0; wgX = DEFAULT_BUFS_LSIZE_1; } if (step->funcID == CLBLAS_GEMM2) { subdimyFactor = 2; subdimxFactor = 1; bcoeff = 4; // 16/bcoeff = 4 - Thats the panel width we want } if ((step->funcID == CLBLAS_TRMV) || (step->funcID == CLBLAS_HEMV)) { if (maxWorkGroupSize >= 256) { wgX = 16; wgY = 16; } else if (maxWorkGroupSize >= 128) { wgX = 8; wgY = 16; } else { // // PENDING: What if maxWorkGroupSize < 64 ???? // wgX = 8; wgY = 8; } } /* * Set block sizes such so the work group would access the whole * memory channel or not exceed cache associativity for the modern * AMD GPU families. * * FIXME: throw the hardcoded constants away */ if (isMatrixInImage(mempat, MATRIX_A) || isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_A)) { dims[0].y = (64 * subdimyFactor) / nrFloats; fixedBw = true; } else { dims[0].y = (32 * subdimyFactor); } if (isMatrixInImage(mempat, MATRIX_B) || isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) { dims[0].x = (64 * subdimxFactor) / nrFloats; fixedBw = true; } else { dims[0].x = (32 * subdimxFactor); } if (step->funcID == CLBLAS_GEMM2) { int count=0; // // NOTE: // wgX and wgY setting for this function must be the same as // CLBLAS_GEMM_TAIL below. // //vecLen = sizeof(cl_float4) / dtypeSize(step->args.dtype); // // PENDING: 16x16 works best on CYPRESS and 16x8 for Cayman // wgY = 8*subdimyFactor; wgX = 8*subdimxFactor; while((wgY * wgX) > maxWorkGroupSize) { if (count & 1) { wgY /= 2; dims[0].y /= 2; } else { wgX /= 2; dims[0].x /= 2; } count++; } } if (step->funcID == CLBLAS_GEMM_TAIL) { // // NOTE: wgY and wgX must be same as what is set for CLBLAS_GEMM2 above // vecLen = 1; // // PENDING: What if maxWorkGroupSize < 64 ???? 
// wgY = 8; wgX = 8; dims[0].y = wgY ; dims[0].x = wgX ; } if((step->funcID == CLBLAS_TRSV) || (step->funcID == CLBLAS_TRSV_GEMV)) { wgY = 8; wgX = 8; dims[0].y = 64; dims[0].x = 64; } dims[0].bwidth = 16 / bcoeff; } /* * Prevent using more than 1/2 of LDS so as to have at least 2 work groups * per compute unit */ if (ldsSize && mempat->sops->isFitToLDS) { ldsSize /= 2; while (!mempat->sops->isFitToLDS(dims, dtype, ldsSize, &step->args)) { /* * decrease current component and setup this one to decrease * on the next step; do not grow down block width below the * value with which the block line takes size of a float4 vector */ if (square) { dims[0].x /= 2; dims[0].y /= 2; dims[0].bwidth /= 2; } else { switch (component) { case SDIM_X: dims[0].x /= 2; if (dims[0].bwidth * tsize == sizeof(cl_float4)) { component = SDIM_Y; } else { component = SDIM_BWIDTH; } break; case SDIM_Y: dims[0].y /= 2; component = SDIM_X; break; case SDIM_BWIDTH: dims[0].bwidth /= 2; component = SDIM_Y; break; } } } assert(dims[0].x > 0 && dims[0].y > 0 && dims[0].bwidth * tsize >= sizeof(cl_float4)); } /* * adjust local size if a subproblem is not divisible * between all local threads */ for (; (wgY > 1) && (dims[0].y < wgY); wgY /= 2) { } for (; (wgX > 1) && (dims[0].x < wgX); wgX /= 2) { } sflags = mempat->sops->getFlags(); if (sflags & SF_WSPACE_2D) { pgran->wgDim = 2; dims[0].itemY = dims[0].y; pgran->wgSize[0] = (unsigned int)wgY; pgran->wgSize[1] = (unsigned int)wgX; } else { pgran->wgDim = 1; pgran->wgSize[0] = (unsigned int)(wgX * wgY); pgran->wgSize[1] = 1; } /* * Divide the work between threads */ dims[1].itemX = dims[0].x / wgX; dims[1].itemY = dims[0].y / wgY; dims[1].x = dims[1].itemX; dims[1].y = dims[1].itemY; if ((mempat->nrLevels == 1) && square) { dims[1].bwidth = dims[1].y; } else { i = fixedBw ? 4 : (8 / nrFloats); dims[1].bwidth = szmin(i, dims[0].bwidth); } dims[0].itemX = dims[0].x; dims[0].itemY = dims[0].y; /* * FIXME: Now, there are issues with generating kernels with non square * tiles in LDS less TRSM due to some fundamental restriction * of the core generator logic. Deprecate this kludge when * they will be eliminated */ #if 1 if ((step->funcID == CLBLAS_TRSM) && (step->patternID == 2)) { dims[1].bwidth = dims[1].y; } #endif if (funcHasTriangMatrix(step->funcID) && (pgran->wgDim == 1)) { dims[0].itemY = SUBDIM_UNUSED; if (mempat->nrLevels == 1) { dims[1].itemY = SUBDIM_UNUSED; } } if (!(isLdsUsed(mempat) || (square && mempat->nrLevels == 2))) { dims[0].bwidth = dims[1].bwidth; } /* * Ensure decomposition size for vectors in case * of level 2 routines equal to 1. 
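 * For those routines the x blocking collapses to a single column and the
 * former number of x blocks is folded into the top-level block width
 * instead.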
*/ if (funcBlasLevel(step->funcID) == 2) { size_t xBlocks; xBlocks = dims[0].x / dims[1].x; dims[0].x = 1; dims[1].itemX = 1; dims[1].x = 1; dims[0].bwidth = dims[1].bwidth * xBlocks; } // fixup work group size in respect with desired work dispatch order if ((pgran->wgDim == 2) && mempat->sops->innerDecompositionAxis) { if (mempat->sops->innerDecompositionAxis(&step->args) == DECOMP_AXIS_X) { unsigned int u; u = pgran->wgSize[0]; pgran->wgSize[0] = pgran->wgSize[1]; pgran->wgSize[1] = u; } } //printf("GDSG: suby = %lu, subx = %lu, bwidth0=%lu, bwidth1=%lu\n", dims[0].y, dims[0].x, dims[0].bwidth, dims[1].bwidth); } static bool avoidLoadFromStorage(SolutionStep *step) { bool notDiv; MemoryPattern *mempat = &clblasSolvers[step->funcID].memPatterns[step->patternID]; bool bothCached = isMatrixCached(mempat, MATRIX_A) && isMatrixCached(mempat, MATRIX_B); if (bothCached) { return false; } if ((step->funcID == CLBLAS_GEMM2) && ((step->args.pigFuncID == CLBLAS_SYMM) || (step->args.pigFuncID == CLBLAS_HEMM)) ) { // FIXME: Assuming that returning "true" will load defaultDecomposition sizes // But the statement below on TRSM is a bit confusing. // Returning FALSE here will load from storage in getStepGranulation() return true; } /* * don't load from storage data for LDS gemm, * not integrally divisible */ notDiv = (step->args.M % 64) || (step->args.N % 64) || (step->args.K % 64); return ((step->funcID == CLBLAS_GEMM) && notDiv); } static bool getStepResources(SolutionStep *step) { int i = 0; size_t tsize; unsigned int vecLen; size_t minWidth, minHeight, bestHeight, minSize, bestSize; MatrixRole mrole; cl_device_id devID; cl_context ctx; MemoryPattern *mempat; SubproblemDim probDim; CLBlasKargs *kargs = &step->args; bool ret = true; tsize = dtypeSize(kargs->dtype); vecLen = (unsigned int)(sizeof(cl_float4) / tsize); kargsToProbDims(&probDim, step->funcID, &step->args, false); getQueueContext(step->cmdQueue, &ctx); getQueueDevice(step->cmdQueue, &devID); mempat = &(clblasSolvers[step->funcID].memPatterns[step->patternID]); for (mrole = MATRIX_A, i = 0; mrole < MATRIX_C; mrole++) { if (isMatrixInImage(mempat, mrole)) { if (step->funcID == CLBLAS_TRSM) { //blocks unsigned int packRate; clblasOrder packOrder; size_t pitch; size_t matrWidth, matrHeight; CLBLASKernExtra extra; memset(&extra, 0, sizeof(extra)); extra.dtype = kargs->dtype; extra.flags = step->extraFlags; mempat->sops->imgPackMode(&extra, step->subdims, mrole, &packRate, &packOrder); // minimal size parameters pitch = matrBlockPitch(step->subdims, mrole, kargs->dtype, kargs->side); matrWidth = matrBlockPitch(&probDim, mrole, kargs->dtype, kargs->side); matrHeight = matrBlockHeight(&probDim, mrole, kargs->side); //One panel should fit to image if (packOrder == clblasRowMajor) { minWidth = divRoundUp(matrWidth, pitch) * pitch / vecLen; minHeight = packRate; minSize = minWidth * minHeight; // size of image to store all blocks bestSize = minHeight * (minWidth + pitch / vecLen) * divRoundUp(matrHeight, packRate) / 2; } else { minWidth = pitch / vecLen; minHeight = divRoundUp(matrHeight, packRate) * packRate; minSize = minWidth * minHeight; bestSize = minWidth * (minHeight + packRate) * divRoundUp(matrWidth, pitch) / 2; } minSize = bestSize; } else { //panels getSuitableImageSizes(&minWidth, &minHeight, &bestHeight, mrole, kargs, vecLen, step->subdims); minSize = minWidth * minHeight; bestSize = minWidth * bestHeight; } kargs->scimage[i] = getSCImage(ctx, devID, bestSize, minSize, minWidth); if (kargs->scimage[i] == NULL) { ret = false; break; } 
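            /*
             * The scimage[] slot index advances only for matrices that the
             * memory pattern actually keeps in images, so a step acquires at
             * most two scratch images (one for A, one for B).
             */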
i++; } } return ret; } static void getSuitableImageSizes( size_t *minWidth, size_t *minHeight, size_t *bestHeight, MatrixRole mrole, CLBlasKargs *kargs, unsigned int vecLen, SubproblemDim *subdims) { size_t alignedM, alignedN, alignedK; alignedM = divRoundUp(kargs->M, subdims->y); alignedM *= subdims->y; alignedN = divRoundUp(kargs->N, subdims->x); alignedN *= subdims->x; alignedK = divRoundUp(kargs->K, subdims->bwidth); alignedK *= subdims->bwidth; switch (mrole) { case MATRIX_A: *minWidth = alignedK / vecLen; *bestHeight = alignedM; *minHeight = subdims->y; break; case MATRIX_B: *minWidth = alignedK / vecLen; *bestHeight = alignedN; *minHeight = subdims->x; break; case MATRIX_C: *minWidth = alignedN / vecLen; *bestHeight = alignedM; *minHeight = subdims->y; break; default: break; } } /* * TRxM -> TRxM + GEMM + TRxM * * When talking about matrix A splitting the following numbering is used: * * +---+---+ * | 1 | 2 | * +---+---+ * | 3 | 4 | * +---+---+ */ static ListNode* decomposeTRXMStep(SolutionStep *step) { CLBlasKargs *kargs = &(step->args); SolutionStep *trxm1 = NULL, *gemm = NULL, *trxm2 = NULL, *tmp; clblasUplo position; SubproblemDim size, offset; int swap; cl_float f; cl_double d; clblasImplementation impl = clblasDefaultGemm; size_t offsetK = 0; // skip decomposition for a trmm case which works faster without it if (step->funcID == CLBLAS_TRMM && !isDoubleBasedType(step->args.dtype) && isMatrixAccessColMaj(step->funcID, step->extraFlags, MATRIX_B)) { return &(step->node); } /* Implementation specific checks */ if ((getGemmPreferredPattern() != clblasDefaultGemm) && (getGemmPreferredPattern() != clblasBlockGemmWithCaching)) { return &(step->node); } if (step->funcID == CLBLAS_TRMM) { impl = getTrmmPreferredPattern(); if ((impl != clblasDefaultTrmm) && (impl != clblasBlockTrmmWithCaching)) { return &(step->node); } } else { impl = getTrsmPreferredPattern(); if ((impl != clblasDefaultTrsm) && (impl != clblasBlockTrsmWithCaching) && (impl != clblasBlockTrsmWithoutLds)) { return &(step->node); } } if ((kargs->side == clblasLeft) && (kargs->M < DECOMPOSITION_THRESHOLD(step->args.dtype))) { return &(step->node); } if ((kargs->side == clblasRight) && (kargs->N < DECOMPOSITION_THRESHOLD(step->args.dtype))) { return &(step->node); } trxm1 = calloc(1, sizeof(SolutionStep)); gemm = calloc(1, sizeof(SolutionStep)); trxm2 = calloc(1, sizeof(SolutionStep)); if ((trxm1 == NULL) || (gemm == NULL) || (trxm2 == NULL)) { if (trxm1 != NULL) { free(trxm1); } if (gemm != NULL) { free(gemm); } if (trxm2 != NULL) { free(trxm2); } return &(step->node); } memcpy(trxm1, step, sizeof(SolutionStep)); memcpy(gemm, step, sizeof(SolutionStep)); memcpy(trxm2, step, sizeof(SolutionStep)); gemm->funcID = CLBLAS_GEMM; gemm->args.C = kargs->B; gemm->args.ldc.matrix = kargs->ldb.matrix; gemm->args.offCY = kargs->offBX; switch (kargs->dtype) { case TYPE_FLOAT: if (step->funcID == CLBLAS_TRSM) { if (gemm->args.alpha.argFloat != 0.0f) { gemm->args.alpha.argFloat = -1 / gemm->args.alpha.argFloat; } } gemm->args.beta.argFloat = 1.0f; break; case TYPE_DOUBLE: if (step->funcID == CLBLAS_TRSM) { if (gemm->args.alpha.argDouble != 0.0f) { gemm->args.alpha.argDouble = -1 / gemm->args.alpha.argDouble; } } gemm->args.beta.argDouble = 1.0f; break; case TYPE_COMPLEX_FLOAT: if (step->funcID == CLBLAS_TRSM) { f = CREAL(gemm->args.alpha.argFloatComplex) * CREAL(gemm->args.alpha.argFloatComplex) + CIMAG(gemm->args.alpha.argFloatComplex) * CIMAG(gemm->args.alpha.argFloatComplex); if (f != 0.0f) { gemm->args.alpha.argFloatComplex = 
floatComplex( -CREAL(gemm->args.alpha.argFloatComplex) / f, CIMAG(gemm->args.alpha.argFloatComplex) / f); } } gemm->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f); break; case TYPE_COMPLEX_DOUBLE: if (step->funcID == CLBLAS_TRSM) { d = CREAL(gemm->args.alpha.argDoubleComplex) * CREAL(gemm->args.alpha.argDoubleComplex) + CIMAG(gemm->args.alpha.argDoubleComplex) * CIMAG(gemm->args.alpha.argDoubleComplex); if (d != 0.0f) { gemm->args.alpha.argDoubleComplex = doubleComplex( -CREAL(gemm->args.alpha.argDoubleComplex) / d, CIMAG(gemm->args.alpha.argDoubleComplex) / d); } } gemm->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f); break; } /* Actual position of matrix A's data to use */ if (kargs->transA == clblasNoTrans) { position = kargs->uplo; } else { position = (kargs->uplo == clblasUpper) ? clblasLower : clblasUpper; } /* Map trxm1 to A1 */ kargsToProbDims(&size, trxm1->funcID, &(trxm1->args), false); size.y = align(size.y / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(trxm1->args), trxm1->funcID, &size, false); /* Map trxm2 to A4 */ kargsToProbDims(&offset, trxm2->funcID, &(trxm2->args), true); kargsToProbDims(&size, trxm2->funcID, &(trxm2->args), false); offset.y += align(size.y / 2, DIVISION_ALIGNMENT); size.y -= align(size.y / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(trxm2->args), trxm2->funcID, &offset, true); probDimsToKargs(&(trxm2->args), trxm2->funcID, &size, false); if (kargs->side == clblasLeft) { trxm1->args.K = trxm1->args.M; trxm2->args.K = trxm2->args.M; gemm->args.transB = clblasNoTrans; if (position == clblasUpper) { /* Map gemm to A2 */ kargsToProbDims(&size, gemm->funcID, &(gemm->args), false); size.y = align(size.y / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(gemm->args), gemm->funcID, &size, false); offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT); gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT); } else { /* Map gemm to A3 */ kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true); kargsToProbDims(&size, gemm->funcID, &(gemm->args), false); offset.y += align(size.y / 2, DIVISION_ALIGNMENT); size.y -= align(size.y / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true); probDimsToKargs(&(gemm->args), gemm->funcID, &size, false); gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT); } } else { trxm1->args.K = trxm1->args.N; trxm2->args.K = trxm2->args.N; gemm->args.transA = clblasNoTrans; gemm->args.A = kargs->B; gemm->args.lda.matrix = kargs->ldb.matrix; gemm->args.offA = kargs->offBX; gemm->args.transB = kargs->transA; gemm->args.B = kargs->A; gemm->args.ldb.matrix = kargs->lda.matrix; gemm->args.offBX = kargs->offA; if (position == clblasUpper) { /* Map gemm to A2 */ kargsToProbDims(&offset, gemm->funcID, &(gemm->args), true); kargsToProbDims(&size, gemm->funcID, &(gemm->args), false); offset.x += align(size.x / 2, DIVISION_ALIGNMENT); size.x -= align(size.x / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(gemm->args), gemm->funcID, &offset, true); probDimsToKargs(&(gemm->args), gemm->funcID, &size, false); gemm->args.K = align(gemm->args.K / 2, DIVISION_ALIGNMENT); } else { /* Map gemm to A3 */ kargsToProbDims(&size, gemm->funcID, &(gemm->args), false); size.x = align(size.x / 2, DIVISION_ALIGNMENT); probDimsToKargs(&(gemm->args), gemm->funcID, &size, false); offsetK = align(gemm->args.K / 2, DIVISION_ALIGNMENT); gemm->args.K -= align(gemm->args.K / 2, DIVISION_ALIGNMENT); } } trxm1->extraFlags = clblasArgsToKextraFlags(&(trxm1->args), trxm1->funcID); gemm->extraFlags = clblasArgsToKextraFlags(&(gemm->args), 
gemm->funcID); trxm2->extraFlags = clblasArgsToKextraFlags(&(trxm2->args), trxm2->funcID); fixupGemmOffsets(&gemm->args, gemm->extraFlags, offsetK); /* Swap trxm1 and trxm2 if needed. */ swap = 0; if (kargs->side == clblasLeft) { if ((step->funcID == CLBLAS_TRMM) && (position == clblasLower)) { swap = 1; } if ((step->funcID == CLBLAS_TRSM) && (position == clblasUpper)) { swap = 1; } } else { if ((step->funcID == CLBLAS_TRMM) && (position == clblasUpper)) { swap = 1; } if ((step->funcID == CLBLAS_TRSM) && (position == clblasLower)) { swap = 1; } } if (swap) { tmp = trxm1; trxm1 = trxm2; trxm2 = tmp; } /* Tie the sequence trmm1 - gemm - trmm2 together. */ trxm1->event = decomposeEventsAlloc(); trxm1->node.next = &(gemm->node); gemm->numEventsInWaitList = 1; gemm->eventWaitList = trxm1->event; gemm->event = decomposeEventsAlloc(); gemm->node.prev = &(trxm1->node); gemm->node.next = &(trxm2->node); trxm2->numEventsInWaitList = 1; trxm2->eventWaitList = gemm->event; trxm2->node.prev = &(gemm->node); /* Insert new sequence instead of current step */ trxm1->node.prev = step->node.prev; (trxm1->node.prev)->next = &(trxm1->node); step->node.prev = NULL; trxm2->node.next = step->node.next; (trxm2->node.next)->prev = &(trxm2->node); step->node.next = NULL; freeSolutionStep(&(step->node)); return &(trxm2->node); } /* * Decompose a SYRK problem in order to evaluate the diagonal part * separately. It's useful since the compiler allocates huge number * of registers for a code processing the diagonal. */ static ListNode* decomposeSYRKStep(SolutionStep *step) { CLBlasKargs *kargs = &step->args; SolutionStep *syrk2 = NULL; size_t thresh; ListNode *next; /* * Tail prediction. Believe that tile sizes will not exceed 8. * Disable decomposition if there are not subproblem tails at * the tile level because it can likely slowdown since diagonal * update is optimized. Actual tail detection is done after * the math decomposition. So the kludge is forced. */ if ((kargs->M % 8 == 0) && (kargs->N % 8 == 0)) { return &(step->node); } thresh = DECOMPOSITION_THRESHOLD(step->args.dtype); if (kargs->M < thresh / 2) { return &(step->node); } syrk2 = malloc(sizeof(SolutionStep)); if (syrk2 == NULL) { return &(step->node); } step->extraFlags |= KEXTRA_SYRK_SEPARATE_DIAGONAL; memcpy(syrk2, step, sizeof(SolutionStep)); syrk2->extraFlags |= KEXTRA_SYRK_EVALUATE_DIAGONAL; next = step->node.next; /* Synchronize the steps */ /* * This is to not disturb synchronization between the current and the next * step or to put the output user event to the tail of the chain if syrk2 * is the last step */ syrk2->event = step->event; step->event = decomposeEventsAlloc(); syrk2->numEventsInWaitList = 1; syrk2->eventWaitList = step->event; /* Insert the additional step to the list */ step->node.next = &syrk2->node; syrk2->node.prev = &step->node; syrk2->node.next = next; next->prev = &syrk2->node; return &(syrk2->node); } static ListNode* decomposeSYR2KStep(SolutionStep *step) { CLBlasKargs *kargs = &(step->args); SolutionStep *syrk1 = NULL, *syrk2 = NULL; size_t thresh; ListNode *node; /* SYR2K implementation is done as blocked with cache-usage optimization * only. Therefore, no implementation specific checks. 
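 * The rank-2K update is split into two SYRK-like steps: the second one
 * swaps the A and B operands and runs with beta = 1 so that it accumulates
 * onto the result of the first; both halves are then passed through
 * decomposeSYRKStep() so the diagonal part can be peeled off into its own
 * kernel where that decomposition applies.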
*/ thresh = DECOMPOSITION_THRESHOLD(step->args.dtype); if (kargs->M < thresh / 2) { return &(step->node); } syrk1 = calloc(1, sizeof(SolutionStep)); syrk2 = calloc(1, sizeof(SolutionStep)); if ((syrk1 == NULL) || (syrk2 == NULL)) { if (syrk1 != NULL) { free(syrk1); } if (syrk2 != NULL) { free(syrk2); } return &(step->node); } memcpy(syrk1, step, sizeof(SolutionStep)); memcpy(syrk2, step, sizeof(SolutionStep)); syrk2->args.A = kargs->B; syrk2->args.lda.matrix = kargs->ldb.matrix; syrk2->args.offA = kargs->offBX; syrk2->args.B = kargs->A; syrk2->args.ldb.matrix = kargs->lda.matrix; syrk2->args.offBX = kargs->offA; switch (kargs->dtype) { case TYPE_FLOAT: syrk2->args.beta.argFloat = 1.0f; break; case TYPE_DOUBLE: syrk2->args.beta.argDouble = 1.0f; break; case TYPE_COMPLEX_FLOAT: syrk2->args.beta.argFloatComplex = floatComplex(1.0f, 0.0f); break; case TYPE_COMPLEX_DOUBLE: syrk2->args.beta.argDoubleComplex = doubleComplex(1.0f, 0.0f); break; } syrk1->extraFlags = clblasArgsToKextraFlags(&(syrk1->args), syrk1->funcID); syrk1->extraFlags &= ~KEXTRA_SYRK_2K_RANK; syrk2->extraFlags = clblasArgsToKextraFlags(&(syrk2->args), syrk2->funcID); syrk2->extraFlags &= ~KEXTRA_SYRK_2K_RANK; /* Tie the sequence syrk1 - syrk2 together. */ syrk1->event = decomposeEventsAlloc(); syrk1->node.next = &(syrk2->node); syrk2->numEventsInWaitList = 1; syrk2->eventWaitList = syrk1->event; syrk2->node.prev = &(syrk1->node); /* Insert new sequence instead of current step */ syrk1->node.prev = step->node.prev; (syrk1->node.prev)->next = &(syrk1->node); step->node.prev = NULL; syrk2->node.next = step->node.next; (syrk2->node.next)->prev = &(syrk2->node); step->node.next = NULL; freeSolutionStep(&(step->node)); /* * Now, decompose each of these steps to evaluate the diagonal * part in a dedicated kernel */ decomposeSYRKStep(syrk1); node = decomposeSYRKStep(syrk2); return node; } clblas-2.10/src/library/blas/gens/000077500000000000000000000000001264277366700170305ustar00rootroot00000000000000clblas-2.10/src/library/blas/gens/asum.cpp000066400000000000000000000166531264277366700205140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // #define DEBUG_ASUM #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? 
(a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_ASUM printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initAsumRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps asumOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, fixupArgs, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_DOT printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if ( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX"); #ifdef DEBUG_ASUM printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldb.vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NEGATIVE"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initAsumRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_ASUM printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &asumOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARG_USAGE(subdims); size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_ASUM printf("ASUM GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_ASUM printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_ASUM printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_ASUM printf("Using Aligned Data Pointer \n"); #endif } strcpy( tempTemplate, (char*)asum_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXasum_kernel( __global %TYPE *_X, __global %TYPE *scratchBuff, uint N, uint offx, int incx) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx; INIT_KARG(&args[0], blasArgs->B); INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); return; } /** The purpose of this function is to add an work-group size indicator in kernelKey, so that a different kernel is generated when work-group size is changed. Reduction loop is unrolled in kprintf based on work-group size. 
Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel this will become a kernelKey, and kernel cache will be accordingly managed. Note -- SubproblemDim is a member of kernelKey **/ static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { DUMMY_ARG_USAGE(extra); CLBlasKargs *kargs = (CLBlasKargs*)args; SolutionStep *step = container_of(kargs, args, SolutionStep); subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]); } clblas-2.10/src/library/blas/gens/axpy_reg.cpp000066400000000000000000000151721264277366700213600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * axpy generator */ //#define DEBUG_AXPY #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? (a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_AXPY printf("solverFlags called......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initAxpyRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps axpyOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_AXPY printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldc.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initAxpyRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_AXPY printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &axpyOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(pgran, subdims); char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_AXPY printf("AXPY GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_AXPY printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_AXPY printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_AXPY printf("Using Aligned Data Pointer .......\n"); #endif } strcpy( tempTemplate, (char*)axpy_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXaxpy_kernel( %TYPE _alpha, __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy; assignScalarKarg(&args[0], &(blasArgs->alpha), blasArgs->dtype); INIT_KARG(&args[1], blasArgs->A); INIT_KARG(&args[2], blasArgs->B); initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[5], incx); initSizeKarg(&args[6], blasArgs->offCY); incy = blasArgs->ldc.vector; INIT_KARG(&args[7], incy); return; } 
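/*
 * Illustrative sketch, not part of the original clBLAS tree: the
 * calcNrThreads() routines in asum.cpp and axpy_reg.cpp above size the
 * 1D NDRange the same way -- enough work-groups for N elements at
 * vecLen elements per work-item, capped at WORKGROUPS_PER_CU groups per
 * compute unit, with the global size kept a multiple of the work-group
 * size. The function and the numbers in main() below are hypothetical
 * and only mirror that arithmetic.
 */
#include <stdio.h>

#define WORKGROUPS_PER_CU 32

static size_t
globalWorkSize(size_t N, unsigned int blockSize, unsigned int vecLen,
               unsigned int computeUnits)
{
    /* work-groups needed if every work-item consumes vecLen elements */
    size_t wgToSpawn = (N - 1) / ((size_t)blockSize * vecLen) + 1;
    /* cap the launch; any remaining elements are presumably covered by
       a loop inside the generated kernel */
    size_t cap = (size_t)computeUnits * WORKGROUPS_PER_CU;

    if (wgToSpawn > cap) {
        wgToSpawn = cap;
    }
    /* global size must stay a multiple of the work-group size */
    return wgToSpawn * blockSize;
}

int
main(void)
{
    /* e.g. 1M elements, 256 work-items per group, 4-wide vectors, 28 CUs */
    printf("global size = %zu\n",
           globalWorkSize((size_t)1 << 20, 256, 4, 28));
    return 0;
}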
clblas-2.10/src/library/blas/gens/blas_kgen.c000066400000000000000000001241101264277366700211200ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * common stuff for blas related * kernel generators */ #include #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include "gen_helper.h" #include "tile_iter.h" #include "kerngen.h" #define IDX_INVAL ((unsigned int)-1) enum { COORD_STRLEN = 64 }; static unsigned int getTmpVecLen( const BlasGenSettings *gset, UpdateResultFlags uflags, const char **vecName) { const CLBLASKernExtra *kextra = gset->kextra; unsigned int vecLen; if (isComplexType(kextra->dtype) || (uflags & (UPRES_GENERIC | UPRES_NO_VECTORIZATION))) { vecLen = 1; } else { vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? kextra->vecLenC : kextra->vecLen; getVectorTypeName(kextra->dtype, vecLen, vecName, NULL); } return vecLen; } /* * Try to transform kernel string to integer. * Return -1. If this is not a number. */ static int stringToInt(const char *str, unsigned int *num) { char *end; unsigned int n; int ret = -1; n = (unsigned int)strtol(str, &end, 10); // believe it is a number if the string has been parsed completely if ((end != str) && (*end == '\0')) { *num = n; ret = 0; } return ret; } void sprintfVecChunk( char *chunk, unsigned int vecLen, unsigned int clen, unsigned int vecOff) { const char *vect = "0123456789abcdef"; if (clen == vecLen) { chunk[0] = '\0'; } else { snprintf(chunk, clen + 3, ".s%s", vect + vecOff); chunk[clen + 2] = '\0'; } } unsigned int getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole) { unsigned int vecLen = 0; const CLBLASKernExtra *kextra = gset->kextra; DUMMY_ARG_USAGE(funcID); if (!(gset->flags & BGF_DISTINCT_VECLEN)) { vecLen = umin(kextra->vecLenA, kextra->vecLenB); vecLen = umin(vecLen, kextra->vecLenC); } else { switch (mrole) { case MATRIX_A: vecLen = kextra->vecLenA; break; case MATRIX_B: vecLen = kextra->vecLenB; break; case MATRIX_C: vecLen = kextra->vecLenC; break; default: break; } } return vecLen; } void genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset) { const KernelVarNames *kvars; unsigned int vecLen; bool done = false; if (!(gset->flags & BGF_LD_IN_VECTORS)) { return; } kvars = &gset->varNames; vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_A); if ((kvars->lda != NULL) && (vecLen > 1)) { kgenPrintf(ctx, "%s /= %u;\n", kvars->lda, vecLen); done = true; } vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_B); if ((kvars->ldb != NULL) && (vecLen > 1) && (kvars->ldb != kvars->lda)) { kgenPrintf(ctx, "%s /= %u;\n", kvars->ldb, vecLen); done = true; } vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_C); if ((kvars->ldc != NULL) && (vecLen > 1) && (kvars->ldc != kvars->lda) && (kvars->ldc != kvars->ldb)) { kgenPrintf(ctx, "%s /= %u;\n", kvars->ldc, 
vecLen); done = true; } if (done) { kgenAddBlankLine(ctx); } } void getPrivateAreaInfo( const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole, PrivateArea *area) { const CLBLASKernExtra *kextra = gset->kextra; const SubproblemDim *dim = &gset->subdims[1]; area->vecLen = getVecLen(gset, funcID, mrole); getVectorTypeName(kextra->dtype, area->vecLen, &area->typeName, NULL); if (mrole == MATRIX_C) { area->size = (unsigned int)(divRoundUp(dim->x, area->vecLen) * dim->y); } else { size_t h = (mrole == MATRIX_A) ? dim->y : dim->x; area->size = (unsigned int)(h * dim->bwidth / area->vecLen); } } void declarePrivateArea( struct KgenContext *ctx, const PrivateArea *area, const char *baseName, PrivateStorageType storType) { char tmp[1024]; unsigned int i; // TODO: separate case for size equal to 1 if (storType == PRIV_STORAGE_ARRAY) { sprintf(tmp, "%s %s[%u];\n", area->typeName, baseName, area->size); } else { char *p; sprintf(tmp, "%s %s0", area->typeName, baseName); p = tmp + strlen(tmp); for (i = 1; i < area->size; i++) { sprintf(p, ", %s%u", baseName, i); p += strlen(p); } strcpy(p, ";\n"); } kgenAddStmt(ctx, tmp); } int defaultTilePostFetch( struct KgenContext *ctx, MatrixRole mrole, void *priv) { char tmp[1024], cond[128]; Kstring src; TilePostFetchPrivate *pfPriv = (TilePostFetchPrivate*)priv; bool distVect = (pfPriv->gset->flags & BGF_DISTINCT_VECLEN); const KernelVarNames *vnames = &pfPriv->gset->varNames; const CLBLASKernExtra *kextra = pfPriv->gset->kextra; const SubproblemDim *dim = &pfPriv->gset->subdims[1]; BlasFunctionID funcID = pfPriv->funcID; const Tile* tile; bool partA; unsigned int step; unsigned int i, j; int ret = 0; unsigned int maxJ = 0; unsigned int maxI = 0; if (!isNeedZeroTileTail(funcID, dim, kextra, mrole, distVect)) { return 0; } if (mrole == MATRIX_A) { tile = &pfPriv->gset->tileA; maxJ = tile->nrCols; maxI = tile->nrRows; } else { tile = &pfPriv->gset->tileBX; maxJ = tile->nrRows; maxI = tile->nrCols; } partA = (mrole == MATRIX_A) && tile->trans && !(pfPriv->gset->flags & BGF_WHOLE_A); step = tileLineSegmentLen(tile); step = (tile->trans ^ (mrole == MATRIX_A)) ? 1 : step; for (j = 0; (j < maxJ) && !ret; j++) { unsigned int k; k = umax(j, (unsigned int)pfPriv->fetchNumA); if (k) { sprintf(tmp, " + %u", k); } else { tmp[0] = '\0'; } sprintf(cond, "(%s%s < %s)", vnames->k, tmp, vnames->sizeK); for (i = 0; (i < maxI) && !ret; i += step) { if (mrole != MATRIX_A) { sprintfTileElement(&src, tile, j, i, step); } else { sprintfTileElement(&src, tile, i, j, step); } sprintf(tmp, "%s = %s ? 
%s : 0;\n", src.buf, cond, src.buf); ret = kgenAddStmt(ctx, tmp); } } if (partA) { pfPriv->fetchNumA++; } if ((tile->nrCols * tile->nrRows / tile->vecLen > 1) && !ret) { ret = kgenAddBlankLine(ctx); } return ret; } char dtypeToBlasPrefix(DataType dtype) { char c; if (dtype == TYPE_FLOAT) { c = 's'; } else { c = dtypeToPrefix(dtype); } return c; } TileMulFlags kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags) { TileMulFlags mf = TILEMUL_NO_FLAGS; if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A)) { mf |= TILEMUL_TRA; } if (isMatrixConj(kflags, MATRIX_A)) { mf |= TILEMUL_CONJA; } if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B)) { mf |= TILEMUL_TRB; } if (isMatrixConj(kflags, MATRIX_B)) { mf |= TILEMUL_CONJB; } return mf; } void getResultGPRsInfo( DataType dtype, const SubproblemDim *dims, unsigned int vecLen, unsigned int *nrRegs, const char **typeName) { if (isComplexType(dtype)) { if (nrRegs) { *nrRegs = (unsigned int)(dims->x * dims->y); } if (typeName != NULL) { *typeName = dtypeBuiltinType(dtype); } } else { // handle different vecLen values and fetch vector sizes if (nrRegs) { *nrRegs = (unsigned int)(divRoundUp(dims->x, vecLen) * dims->y); } if (typeName != NULL) { getVectorTypeName(dtype, vecLen, typeName, NULL); } } } static void genVectorCPtr( struct KgenContext *pCtx, const BlasGenSettings *pGSet, const char* GPtrName, const char* VCPtrName ) { const char *typeName; unsigned int vecLen = 0; vecLen = getVecLen( pGSet, 0, MATRIX_C ); vecLen = vecLen > pGSet->tileCY.vecLen ? pGSet->tileCY.vecLen : vecLen; getVectorTypeName( pGSet->kextra->dtype, vecLen, &typeName, NULL ); if ( 0 == (pGSet->flags & BGF_LD_IN_VECTORS) ) { vecLen = 1; } // Blas function ID is omitted if ( isComplexType( pGSet->kextra->dtype ) ) { vecLen *= 2; } if ( isDoubleBasedType(pGSet->kextra->dtype) ) { if ( 1 == vecLen ) { kgenPrintf( pCtx, "__global %s *%s = %s.d;\n", typeName, VCPtrName, GPtrName); } else { kgenPrintf( pCtx, "__global %s *%s = %s.d%dv;\n", typeName, VCPtrName, GPtrName, vecLen); } } else { if ( 1 == vecLen ) { kgenPrintf( pCtx, "__global %s *%s = %s.f;\n", typeName, VCPtrName, GPtrName); } else { kgenPrintf( pCtx, "__global %s *%s = %s.f%dv;\n", typeName, VCPtrName, GPtrName, vecLen); } } } static void updateOptimResultGen( struct KgenContext *pCtx, const BlasGenSettings *pGSet, BlasFunctionID funcID, UpdateResultOp op, UpdateResultFlags flags) { KernelExtraFlags kflags = pGSet->kextra->flags; Tile tempCTile; Tile fullCTile; unsigned int physVecLenC; DataType dtype; const KernelVarNames *pVNames = NULL; PhysTileIterator physIter; PhysTileIterator blkIter; char cPtrName[] = "pC"; const char *typeNameC; bool phyTrans = 0; unsigned int vecLen = 0; unsigned int nBlocks = 0; unsigned int i = 0; Kstring cElem; Kstring tempCElem; Kstring kstrFirst; Kstring kstrSecond; Kstring kstrThird; Kstring expr; //EINVAL if ( NULL == pCtx || NULL == pGSet ) { return; } dtype = pGSet->kextra->dtype; pVNames = &pGSet->varNames; phyTrans = ( (flags & UPRES_COLUMN_MAJOR ) != 0 ); physVecLenC = getVecLen( pGSet, funcID, MATRIX_C ); getVectorTypeName( dtype, getVecLen( pGSet,0,MATRIX_C ), &typeNameC, NULL ); // declare private C pointer genVectorCPtr( pCtx, pGSet, "uC", "pC" ); kgenAddBlankLine( pCtx ); // calculate the number of blocks, update should be divided on nBlocks = pGSet->tileCY.nrCols * pGSet->tileCY.nrRows/( pGSet->tileA.nrCols*pGSet->tileA.nrRows + pGSet->tileBX.nrCols*pGSet->tileBX.nrRows ); if( pGSet->tileCY.nrCols * pGSet->tileCY.nrRows%( pGSet->tileA.nrCols*pGSet->tileA.nrRows + 
pGSet->tileBX.nrCols*pGSet->tileBX.nrRows ) ){ nBlocks++; } nBlocks = roundUpPow2( (int)nBlocks ); // declare the temporary C tile // temporary C tile must have the same transposition as C matrix // for read-write optimization it also has the same vectorization if ( phyTrans ) { if ( nBlocks > pGSet->tileCY.nrCols ) { nBlocks = pGSet->tileCY.nrCols; } initTile( &tempCTile, "tempC", pGSet->tileCY.nrRows, pGSet->tileCY.nrCols/nBlocks, pGSet->tileCY.vecLen, dtype, PRIV_STORAGE_VARIABLE_SET, phyTrans, true ); initTile( &fullCTile, "fullC", pGSet->tileCY.nrRows, pGSet->tileCY.nrCols, pGSet->tileCY.vecLen, dtype, PRIV_STORAGE_VARIABLE_SET, phyTrans, true); } else { if ( nBlocks > pGSet->tileCY.nrRows ) { nBlocks = pGSet->tileCY.nrRows; } initTile( &tempCTile, "tempC", pGSet->tileCY.nrRows/nBlocks, pGSet->tileCY.nrCols, pGSet->tileCY.vecLen, dtype, PRIV_STORAGE_VARIABLE_SET, phyTrans, true ); initTile( &fullCTile, "fullC", pGSet->tileCY.nrRows, pGSet->tileCY.nrCols, pGSet->tileCY.vecLen, dtype, PRIV_STORAGE_VARIABLE_SET, phyTrans, true); } declareOneTileStorage( pCtx, &tempCTile ); // splitting update result on several blocks to prevent // increasing GPR usage for ( i = 0; i < nBlocks; i++ ) { kgenAddBlankLine(pCtx); // fetch ------------------------------------------------------------------ vecLen = umin( physVecLenC, pGSet->tileCY.vecLen ); vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); iterInit( &blkIter, &tempCTile, vecLen, 0 ); iterInit( &physIter, &fullCTile, vecLen, 0 ); iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); if (op == UPRES_SUM) { for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), iterIterate( &physIter ) ) { emptyKstring( &kstrFirst ); emptyKstring( &kstrSecond ); emptyKstring( &kstrThird ); emptyKstring( &cElem ); emptyKstring( &tempCElem ); sprintfTileElement( &tempCElem, &tempCTile, blkIter.row, blkIter.col, vecLen); ksprintf( &kstrFirst, "%d", physIter.line ); ksprintf( &kstrSecond, "%s", pVNames->ldc ); ksprintf( &kstrThird, "%d", blkIter.vec ); sprintfFastScalarMad( &expr, &kstrFirst, &kstrSecond, vecLen,//physVecLenC,//scale ldc &kstrThird); kgenPrintf( pCtx, "%s = %s[%s];\n", tempCElem.buf, cPtrName, expr.buf ); } } // beta --------------------------------------------------------------- if ( flags & UPRES_WITH_BETA ) { if ( isComplexType(dtype) || ( pGSet->tileCY.trans != tempCTile.trans ) ) { vecLen = 1; } //TODO: for real datatype find longest available veclen can be used //to generate more compact code else { vecLen = pGSet->tileCY.vecLen; } vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); iterInit( &blkIter, &tempCTile, vecLen, 0 ); for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ) ) { sprintfTileElement( &tempCElem, &tempCTile, blkIter.row, blkIter.col, vecLen); if ( isComplexType(dtype) ) { //complex mad ksprintf( &kstrSecond, "%s", pVNames->beta ); sprintfComplexMulUpdate( &expr, &tempCElem, &tempCElem, &kstrSecond, NULL, isDoubleBasedType(dtype), 0, 0, 0 ); kgenPrintf( pCtx, "%s", expr.buf ); } else { if ((kflags & KEXTRA_ENABLE_MAD) != 0) { kgenPrintf( pCtx, "%s = mad(%s, %s, 0);\n", tempCElem.buf, tempCElem.buf, pVNames->beta); } else { kgenPrintf( pCtx, "%s = %s * %s;\n", tempCElem.buf, tempCElem.buf, pVNames->beta); } } } } // alpha--------------------------------------------------------------- if ( (phyTrans == pGSet->tileCY.trans) && (!isComplexType(dtype)) ) { vecLen = pGSet->tileCY.vecLen; } else { vecLen = 1; } vecLen = umin( vecLen, tileLineSegmentLen(&tempCTile) ); iterInit( &blkIter, &tempCTile, 
vecLen, 0 ); iterInit( &physIter, &fullCTile, vecLen, 0 ); iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), iterIterate( &physIter) ) { const Kstring *dst; dst = (flags & UPRES_PRIV_DEST) ? &cElem : &tempCElem; sprintfTileElement( &tempCElem, &tempCTile, blkIter.row, blkIter.col, vecLen); sprintfTileElement( &cElem, &pGSet->tileCY, physIter.row, physIter.col, vecLen); // complex if ( isComplexType(dtype) ) { ksprintf( &kstrSecond, "%s", pVNames->alpha ); // upres op: sum or set, if set, third argument // of complex mad() is zero sprintfComplexMulUpdate( &expr, dst, &cElem, &kstrSecond, (op == UPRES_SUM) ? &tempCElem : NULL, isDoubleBasedType(dtype), 0, 0, 0); kgenPrintf( pCtx, "%s", expr.buf ); } // real else { // upres op: sum or set, if set, third argument // of mad() is zero if ((kflags & KEXTRA_ENABLE_MAD) != 0) { kgenPrintf( pCtx, "%s = mad(%s, %s, %s);\n", dst, cElem.buf, pVNames->alpha, (op == UPRES_SUM) ? tempCElem.buf : "0" ); } else { kgenPrintf( pCtx, "%s = %s * %s + %s;\n", dst, cElem.buf, pVNames->alpha, (op == UPRES_SUM) ? tempCElem.buf : "0" ); } } } if (flags & UPRES_PRIV_DEST) { return; } // store--------------------------------------------------------------- vecLen = umin( physVecLenC, pGSet->tileCY.vecLen ); vecLen = umin( vecLen, tileLineSegmentLen( &tempCTile ) ); iterInit( &blkIter, &tempCTile, vecLen, 0 ); iterInit( &physIter, &fullCTile, vecLen, 0 ); iterSeekPhys( &physIter, blkIter.nrLines * i, blkIter.vec ); for ( ; 0 == iterIsEnd( &blkIter ); iterIterate( &blkIter ), iterIterate( &physIter ) ) { emptyKstring( &kstrFirst ); emptyKstring( &kstrSecond ); emptyKstring( &kstrThird ); emptyKstring( &cElem ); emptyKstring( &tempCElem ); sprintfTileElement( &tempCElem, &tempCTile, blkIter.row, blkIter.col, vecLen); ksprintf( &kstrFirst, "%d", physIter.line ); ksprintf( &kstrSecond, "%s", pVNames->ldc ); ksprintf( &kstrThird, "%d", blkIter.vec ); sprintfFastScalarMad( &expr, &kstrFirst, &kstrSecond, vecLen,//physVecLenC,//scale ldc &kstrThird); kgenPrintf( pCtx, "%s[%s] = %s;\n", cPtrName, expr.buf, tempCElem.buf ); } } } int genUpdateResultSingle( struct KgenContext *ctx, const char *dst, const char *src, const BlasGenSettings *gset, UpdateResultOp op, UpdateResultFlags flags) { char tmp[1024]; char *p; const char *opStr; UpdateResultFlags m; int r; bool isComplex = isComplexType(gset->kextra->dtype); // copy destination with respective operator and additional operations if (flags & UPRES_WITH_BETA) { if (isComplex) { sprintf(tmp, "%s = %s * betaR + %s.yx * betaI + ", dst, dst, dst); } else { sprintf(tmp, "%s = %s * beta + ", dst, dst); } } else { opStr = (op == UPRES_SET) ? "=" : "+="; sprintf(tmp, "%s %s ", dst, opStr); } m = UPRES_WITH_BETA | UPRES_GENERIC; if (isComplex && ((flags & m) == m)) { strcat(tmp, "\n "); } p = tmp + strlen(tmp); // multiply source if (flags & UPRES_WITHOUT_ALPHA) { sprintf(p, "%s;\n", src); } else { if (isComplex) { sprintf(p, "%s * alphaR + %s.yx * alphaI;\n", src, src); } else { sprintf(p, "%s * alpha;\n", src); } } r = kgenAddStmt(ctx, tmp); return (r) ? 
-EOVERFLOW : 0; } static void updateGenericResultGen( struct KgenContext *ctx, const BlasGenSettings *gset, size_t pitch, UpresVarNames* uvars, UpdateResultOp op, UpdateResultFlags flags, const char *cachedName) { char tmp[1024], dst[128], src[128]; const char *boundNames[2] = {uvars->nrRows, uvars->nrCols}; const char *vecType = NULL; const char *vFieldVectorized; DataType dtype = gset->kextra->dtype; unsigned int wvlen; unsigned int sizes[2]; const char* vfield = dtypeUPtrField(dtype); bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0); bool row = ((flags & UPRES_TAIL_ROW)); bool col = ((flags & UPRES_TAIL_COL)); bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0) || (gset->tileCY.storType != PRIV_STORAGE_ARRAY); int l0; int l1; bool revert = false; Kstring kstr; int rowId; int colId; sizes[0] = (unsigned int)gset->subdims[1].y; sizes[1] = (unsigned int)gset->subdims[1].x; if (iwc) { const char* l0var = boundNames[tra]; revert = (tra && col) || (!tra && row); if (revert) { sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld); } else { sprintf(tmp, "\n"); } kgenAddStmt(ctx, tmp); } wvlen = getTmpVecLen(gset, flags, &vecType); if (!iwc) { getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized); sprintf(tmp, "res.%s = c;\n", vFieldVectorized); kgenAddStmt(ctx, tmp); } if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) { char offStr[64]; char *p = offStr; offStr[0] = '\0'; if (flags & UPRES_TAIL_ROW) { sprintf(offStr, " + (%u - %s) * %lu", sizes[0], uvars->nrRows, pitch); p += strlen(offStr); } if (flags & UPRES_TAIL_COL) { sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols); } if (iwc) { sprintf(tmp, "res.%s = uC.%s%s;\n", vfield, vfield, offStr); sprintf(tmp, "\n"); } else { sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr); } kgenAddStmt(ctx, tmp); } if (iwc) { int l0st = 1; int l0en = sizes[tra]; int l1st = 1; int l1en = sizes[1-tra]; const char* l0var = boundNames[tra]; const char* l1var = boundNames[1-tra]; for (l0 = l0en; l0 >= l0st; l0--) { sprintf(tmp, "if (%s) ",l0var); kgenBeginBranch(ctx, tmp); sprintf(tmp, "switch (%s)", l1var); kgenBeginBranch(ctx, tmp); for (l1 = l1en; l1 >= l1st; l1--) { sprintf(tmp, "case %d:\n", l1); kgenAddStmt(ctx, tmp); if (tra) { rowId = (row)? (l1en-l1): (l1-l1st); colId = (col)? (l0-l0st): (l0en-l0); } else { /////////////////////////// rowId = (row)? (l0-l0st): (l0en-l0); colId = (col)? 
(l1en-l1) : (l1-l1st); } if ((tra && row) || (!tra && col)) { sprintf(dst, "uC.%s[(%s+%d) %% %i]", vfield, l1var, (l1en - l1), (int)l1en); } else { sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st)); } sprintfTileElement(&kstr, &gset->tileCY, rowId, colId, wvlen); if (flags & UPRES_PRIV_DEST) { genUpdateResultSingle(ctx, kstr.buf, dst, gset, op, flags); } else { genUpdateResultSingle(ctx, dst, kstr.buf, gset, op, flags); } } kgenEndBranch(ctx, NULL); if (revert) { sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld); } else { sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld); } kgenAddStmt(ctx, tmp); sprintf(tmp, "%s--;\n", l0var); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } } else { sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]); kgenBeginBranch(ctx, tmp); sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]); kgenBeginBranch(ctx, tmp); sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld); if (cachedName) { unsigned int i; char tmpcachedName[80] = " = "; strcat(tmpcachedName, cachedName); for (i = 3; i < strlen(tmpcachedName); i++) { if (strncmp(tmpcachedName+i, "%u", 2) == 0) { tmpcachedName[i+1] = 's'; } } sprintf(tmp, tmpcachedName, "i", "[j]"); strcat(dst, tmp); } // result (res) can be transposed independently of the matrix C // If the transposition of "C" and "result" is not consistent // then change the calculation of the index for "result" if (gset->tileCY.trans ^ tra) { sprintf(src, "res.%s[j * %lu + i]", vfield, pitch); } else { sprintf(src, "res.%s[i * %lu + j]", vfield, pitch); } if (flags & UPRES_PRIV_DEST) { genUpdateResultSingle(ctx, src, dst, gset, op, flags); } else { genUpdateResultSingle(ctx, dst, src, gset, op, flags); } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } } //----------------------------------------------------------------------------- int updateResultGen( struct KgenContext *ctx, const BlasGenSettings *gset, BlasFunctionID funcID, UpdateResultOp op, UpdateResultFlags flags, const UpresVarNames *uvarNames) { char tmp[1024]; char *p = tmp; const char *typeName; const char *vecType = NULL; const char *vfield; const char *suff1; const char *suff2; int ret = 0; unsigned int sizes[2]; bool generic, tra; unsigned int wvlen; // length of vectors to copy with unsigned int uplen; // length of vectors to update result with size_t pitch; char LG; DataType dtype = gset->kextra->dtype; unsigned int vecLen; bool isInlined = (flags & UPRES_INLINE); UpresVarNames uvars; vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC : gset->kextra->vecLen; sizes[0] = (unsigned int)gset->subdims[1].y; sizes[1] = (unsigned int)gset->subdims[1].x; if (isComplexType(dtype)) { vecLen = 1; } if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) { return -EINVAL; } tra = ((flags & UPRES_COLUMN_MAJOR) != 0); generic = ((flags & UPRES_GENERIC) != 0); typeName = dtypeBuiltinType(dtype); vfield = dtypeUPtrField(dtype); pitch = roundUp(sizes[1], vecLen); // select write vectorization wvlen = getTmpVecLen(gset, flags, &vecType); uplen = (tra ^ gset->tileCY.trans || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen; suff1 = (generic) ? "Generic" : ""; suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : ""; LG = (flags & UPRES_USE_LDS) ? 'L' : 'G'; if (!isInlined) { const char *outTypeName; const char *memPref = (flags & UPRES_USE_LDS) ? 
"__local" : "__global"; getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName); // define the function sprintf(tmp, "void\n" "updateResult%s%s%c(\n" " %s %s *C,\n" " %s *c,\n" " %s alpha,\n" " uint startRow,\n" " uint startCol,\n" " uint ld", suff1, suff2, LG, memPref, typeName, outTypeName, typeName); p += strlen(p); if (flags & UPRES_WITH_BETA) { sprintf(p, ",\n %s beta", typeName); p += strlen(p); } if (generic) { sprintf(p, ",\n uint nrRows,\n" " uint nrCols"); } uvars.result = "C"; uvars.ld = "ld"; uvars.startRow = "startRow"; uvars.startCol = "startCol"; uvars.nrRows = "nrRows"; uvars.nrCols = "nrCols"; strcat(p, ")\n"); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); } else { memcpy(&uvars, uvarNames, sizeof(uvars)); } // declare local variables sprintf(tmp, "%cPtr uC;\n", LG); kgenAddStmt(ctx, tmp); if (generic) { kgenAddStmt(ctx, "int i, j;\n" "PPtr res;\n"); } else { /* * temporary pointer to pass correctly over the * destination array since destination rows can be * not aligned on a vector bound */ if (sizes[1 - tra] % wvlen != 0) { sprintf(tmp, "%cPtr tmpC;\n", LG); kgenAddStmt(ctx, tmp); } if (wvlen > uplen) { sprintf(tmp, "%s tmp;\n", vecType); kgenAddStmt(ctx, tmp); } } if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) { declareComplexMultParts(ctx, "alpha", typeName); if (flags & UPRES_WITH_BETA) { declareComplexMultParts(ctx, "beta", typeName); } } kgenAddBlankLine(ctx); // LD is scaled if ( gset->flags & BGF_LD_IN_VECTORS ) { vecLen = getVecLen(gset, 0, MATRIX_C); } else { vecLen = 1; } if (tra) { if ( vecLen > 1 ) { sprintf(tmp, "uC.%s = %s + (%s * %s + %s)/%d;\n", vfield, uvars.result, uvars.startCol, uvars.ld, uvars.startRow, vecLen); } else { sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n", vfield, uvars.result, uvars.startCol, uvars.ld, uvars.startRow); } } else { if ( vecLen > 1 ) { sprintf(tmp, "uC.%s = %s + (%s * %s + %s)/%d;\n", vfield, uvars.result, uvars.startRow, uvars.ld, uvars.startCol, vecLen); } else { sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n", vfield, uvars.result, uvars.startRow, uvars.ld, uvars.startCol); } } kgenAddStmt(ctx, tmp); if ((sizes[1 - tra] % wvlen != 0) && !generic) { kgenAddStmt(ctx, "tmpC = uC;\n"); } ret = kgenAddBlankLine(ctx); if (generic) { updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags, uvarNames ? uvarNames->cachedName : NULL); } else { updateOptimResultGen(ctx, gset, funcID, op, flags); } if (!isInlined) { ret = kgenEndFuncBody(ctx); } return (ret) ? -EOVERFLOW : 0; } TailFetch checkForTailFetches( BlasFunctionID funcID, const SubproblemDim *dim, const CLBLASKernExtra *kextra, MatrixRole mrole, bool distVect, bool lowerTails) { TailFetch ret = FETCH_NO_TAILS; size_t x; KernelExtraFlags tailFlag; unsigned int vecLen; KernelExtraFlags tailFlagM, tailFlagN, tailFlagK; tailFlagM = lowerTails ? KEXTRA_TAILS_M_LOWER : KEXTRA_TAILS_M; tailFlagN = lowerTails ? KEXTRA_TAILS_N_LOWER : KEXTRA_TAILS_N; tailFlagK = lowerTails ? KEXTRA_TAILS_K_LOWER : KEXTRA_TAILS_K; if (mrole == MATRIX_A) { x = dim->y; tailFlag = tailFlagM; vecLen = (distVect) ? kextra->vecLenA : kextra->vecLen; } else { x = dim->x; tailFlag = tailFlagN; vecLen = (distVect) ? 
kextra->vecLenB : kextra->vecLen; } if (isMatrixAccessColMaj(funcID, kextra->flags, mrole)) { if ((kextra->flags & tailFlag) && (x != vecLen)) { ret |= FETCH_TAIL_COL; } if (kextra->flags & tailFlagK) { ret |= FETCH_TAIL_ROW; } } else if (kextra->flags & tailFlagK) { ret |= FETCH_TAIL_COL; } return ret; } bool isNeedZeroTileTail( BlasFunctionID funcID, const SubproblemDim *dim, const CLBLASKernExtra *kextra, MatrixRole mrole, bool distVect) { bool trans; TailFetch tf; trans = isMatrixAccessColMaj(funcID, kextra->flags, mrole); tf = checkForTailFetches(funcID, dim, kextra, mrole, distVect, true); return (trans && (tf & FETCH_TAIL_ROW)) || (!trans && (tf & FETCH_TAIL_COL)); } TailStatus checkGenAdjustTailCoords( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, int *error) { char tmp[1024]; const SubproblemDim *dim = &gset->subdims[1]; const KernelVarNames *varNames = &gset->varNames; KernelExtraFlags kflags = gset->kextra->flags; TailStatus status = 0; int err = 0; int n = 0; if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_A) && (kflags & KEXTRA_TAILS_M_LOWER)) { status |= TAIL_A_RAISED; sprintf(tmp, "if (%s + %lu > %s) {\n" " %s -= %lu - %s %% %lu;\n" "}\n", varNames->coordA, dim->y, varNames->sizeM, varNames->coordA, dim->y, varNames->sizeM, dim->y); if (ctx != NULL) { err = kgenAddStmt(ctx, tmp); n++; } } if (!isMatrixAccessColMaj(funcID, kflags, MATRIX_B) && (kflags & KEXTRA_TAILS_N_LOWER) && !err) { status |= TAIL_B_RAISED; sprintf(tmp, "if (%s + %lu > %s) {\n" " %s -= %lu - %s %% %lu;\n" "}\n", varNames->coordB, dim->x, varNames->sizeN, varNames->coordB, dim->x, varNames->sizeN, dim->x); if (ctx != NULL) { err = kgenAddStmt(ctx, tmp); n++; } } if (n && !err) { err = kgenAddBlankLine(ctx); } if (error != NULL) { *error = err; } return status; } int checkGenRestoreTailCoords( struct KgenContext *ctx, const BlasGenSettings *gset, TailStatus status) { char tmp[1024]; const SubproblemDim *dim = &gset->subdims[1]; const KernelVarNames *varNames = &gset->varNames; int ret = 0; int n = 0; if (status & TAIL_A_RAISED) { sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n" " %s += %lu - %s %% %lu;\n" "}\n", varNames->coordA, dim->y, varNames->sizeM, varNames->sizeM, dim->y, varNames->coordA, dim->y, varNames->sizeM, dim->y); ret = kgenAddStmt(ctx, tmp); n++; } if ((status & TAIL_B_RAISED) && !ret) { sprintf(tmp, "if ((%s + %lu == %s) && (%s %% %lu)) {\n" " %s += %lu - %s %% %lu;\n" "}\n", varNames->coordB, dim->x, varNames->sizeN, varNames->sizeN, dim->x, varNames->coordB, dim->x, varNames->sizeN, dim->x); kgenAddStmt(ctx, tmp); n++; } if (n) { ret = kgenAddBlankLine(ctx); } return (ret) ? -EOVERFLOW : 0; } UpdateResultFlags tailStatusToUpresFlags(TailStatus status) { UpdateResultFlags flags = 0; if (status & TAIL_A_RAISED) { flags |= UPRES_TAIL_ROW; } if (status & TAIL_B_RAISED) { flags |= UPRES_TAIL_COL; } return flags; } int declareComplexMultParts( struct KgenContext *ctx, const char *baseName, const char *typeName) { char tmp[1024]; int r; sprintf(tmp, "%s %sR = (%s)(%s.x);\n" "%s %sI = (%s)(-%s.y, %s.y);\n", typeName, baseName, typeName, baseName, typeName, baseName, typeName, baseName, baseName); r = kgenAddStmt(ctx, tmp); return (r) ? 
-EOVERFLOW : 0; } void sprintfFastScalarMad( Kstring *expr, const Kstring *first, const Kstring *second, unsigned int scale, const Kstring *third) { unsigned int u1 = 0, u2 = 0, u3 = 0; bool isNum1, isNum2, isNum3; int shift; bool done = false; const char *thirdStr; const char *suff3; // clear up what are these arguments if (isKstringEmpty(first)) { isNum1 = true; } else { isNum1 = !stringToInt(first->buf, &u1); } if (isKstringEmpty(second)) { isNum2 = true; } else { isNum2 = !stringToInt(second->buf, &u2); } if (!scale) { scale = 1; } if ((third == NULL) || isKstringEmpty(third)) { thirdStr = "0"; isNum3 = true; } else { thirdStr = third->buf; isNum3 = !stringToInt(thirdStr, &u3); } suff3 = (isNum3) ? "u" : ""; // singular case at which only the third component can contribute if ( (isNum1 && (u1 == 0)) || (isNum2 && (u2 /scale == 0))) { kstrcpy(expr, thirdStr); return; } if (isNum1 && isNum2) { if (isNum3) { ksprintf(expr, "%u", u1 * u2 / scale + u3); } else { ksprintf(expr, "%u + %s", u1 * u2 / scale, thirdStr); } done = true; } else if (isNum1) { /* * If the third argument is not used, then try to build the expression * using only shifts if 'scale' and the 'second argument' are both of * power of 2. Otherwise use mad24. */ if (isRoundedPow2(u1) && isRoundedPow2(scale)) { shift = findHighestSetBit(scale) - findHighestSetBit(u1); if (isNum3 && (u3 == 0)) { if (shift < 0) { ksprintf(expr, "(%s << %d)", second->buf, -shift); } else if (shift > 0) { ksprintf(expr, "(%s >> %d)", second->buf, shift); } else { kstrcpy(expr, second->buf); } } else if (shift > 0) { ksprintf(expr, "(%s >> %d) + %s", second->buf, shift, thirdStr); } else if (shift == 0) { ksprintf(expr, "%s + %s", second->buf, thirdStr); } else { ksprintf(expr, "mad24(%uu, %s, %s%s)", 1u << -shift, second->buf, thirdStr, suff3); } done = true; } } if (!done) { /* * Append unsiged suffixes to avoid cases at which one * operand is signed and the other is unsigned. Typically, * OpenCL compilers are strict and reject such expressions. */ if (isNum2) { if (u2 / scale == 1) { if (isNum3 && (u3 == 0)) { kstrcpy(expr, first->buf); } else { ksprintf(expr, "%s + %s", first->buf, thirdStr); } } else { ksprintf(expr, "mad24(%s, %uu, %s%s)", first->buf, u2 / scale, thirdStr, suff3); } } else { const char *suff1 = (isNum1) ? "u" : ""; Kstring tmp; const char *p = NULL; if (scale == 1) { p = second->buf; } else { p = tmp.buf; if (isRoundedPow2(scale)) { shift = findHighestSetBit(scale); ksprintf(&tmp, "(%s >> %d)", second->buf, shift); } else { ksprintf(&tmp, "%s / %d", second->buf, scale); } } ksprintf(expr, "mad24(%s%s, %s, %s%s)", first->buf, suff1, p, thirdStr, suff3); } } } clblas-2.10/src/library/blas/gens/blas_kgen.h000066400000000000000000000720331264277366700211330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Something specific for BLAS generators * * NOTE: * 1) All the blas kernel generators should * perceive fields of the SubproblemDim * structure as following: * 'y' - rows of matrix A, i. e. M dimension * of matrix C * 'x' - columns of matrix B and C * 'bwidth' - block width in K dimension * * 2) At generating copying functions and their calls one should * keep in mind, all the matrix blocks are copied in * the local memory such that sequentially accessed elements * are located in memory sequentially. In this context * transposing is perceived as transposing at copying * to/from the local memory, not matrix storage way in * the array passed to kernel. */ #ifndef BLAS_KGEN_H_ #define BLAS_KGEN_H_ #include #include #include #include #include #include #include #include "tile.h" #include "fetch.h" #define BLAS_KGEN_FORMAT 1 #define genInternalLoopEnd(ctx) kgenEndBranch(ctx, NULL) enum { MAX_OPENCL_VECTOR_LENGTH = 16 }; typedef enum TailFetch { FETCH_NO_TAILS = 0, FETCH_TAIL_ROW = 0x01, FETCH_TAIL_COL = 0x02 } TailFetch; /** * @internal * @brief Blas generator flags * @ingroup GEN_SETTINGS */ typedef enum BlasGenFlags { BGF_EXPLICIT_INLINE = 0x01, BGF_DISTINCT_VECLEN = 0x02, // TODO: replace with a flags with inverse semantics BGF_WHOLE_A = 0x04, /** Leading dimension are in vectors rather than in elements */ BGF_LD_IN_VECTORS = 0x08, /** * Objects in the global memory are accessed through the unified pointers. * This feature is deprecated and should be not used in new generators. * It is left for backward compatibility */ BGF_UPTRS = 0x10 } BlasGenFlags; /** * @internal * @brief Flags showing how problem tails are handled * @ingroup TAILS_HANDLING */ typedef enum TailStatus { /** Tail of the matrix A is raised */ TAIL_A_RAISED = 0x01, /** Tail of the matrix B is raised */ TAIL_B_RAISED = 0x02 } TailStatus; /** * @internal * @brief Tiles multiplier flags * @ingroup BLAS_MAJOR_SUBGENS */ typedef enum TileMulFlags { TILEMUL_NO_FLAGS = 0, /**< No flags */ TILEMUL_TRA = 0x01, /**< Transposed matrix A */ TILEMUL_TRB = 0x02, /**< Transposed matrix B */ TILEMUL_CONJA = 0x04, /**< Conjugated elements of A */ TILEMUL_CONJB = 0x08, /**< Conjugated elements of B */ TILEMUL_C_COLUMN_MAJOR = 0x10, /**< Column major block for matrix C */ TILEMUL_NOT_FETCH_B = 0x20, /**< Do not fetch matrix B block */ TILEMUL_EXTERN_RDECL = 0x40, /**< External register tiles declaration, the generator must not declare them itself */ /** * Deprecated. Use the repsective mode being a part of FetchAddr mode. * He is left just for backward compatibility to don't break the working * code and will be removed soon */ TILEMUL_WRAP_AROUND_TAIL = 0x80, /**< Sizes used for column skew are rounded to next vecLen bound */ /** Use global cyclic along subproblem A coordinate. * Deprecated. Don't use it */ TILEMUL_GLOBAL_CYCLIC_A = 0x100, /** Use global cyclic along subproblem B coordinate. * Deprecated don't use it */ TILEMUL_GLOBAL_CYCLIC_B = 0x200, /* Deprecated. Don't use it */ TILEMUL_GLOBAL_CYCLIC_K = 0x400, /**< Use global cyclic along K */ /** Use skew along subproblem A coordinate */ TILEMUL_SKEW_A = 0x800, /** Use skew along subproblem B coordinate. Deprecated */ TILEMUL_SKEW_B = 0x1000, /* Deprecated */ TILEMUL_SKEW_K = 0x2000, /**< Use skew along K */ /** Use size of whole matrix for cyclic addressing. 
Deprecated */ TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_B | TILEMUL_GLOBAL_CYCLIC_K, // Deprecated TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K, /** Optimize coordinates calculations by storing coordinates values */ // Deprecated TILEMUL_OPTIMIZE_COORD_CALC = 0x4000, /** Use bwidth0 stride */ TILEMUL_BW_STRIDE = 0x8000, /** Optimize coordinates calculations by using vectors * and pointer increments */ // Deprecated TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000, /** Do not increment K*/ TILEMUL_NOT_INC_K = 0x20000, /** * Use variants with explicit vectorization. Useful on platforms with * true SIMD. */ TILEMUL_FORCE_VECTORIZATION = 0x40000 } TileMulFlags; /** * @internal * @brief Tiles multiplier core * @ingroup BLAS_MAJOR_SUBGENS */ typedef enum TileMulCore { /** Use multiplication and addition operations */ TILEMUL_MULADD, /** Use the 'dot' function where possible */ TILEMUL_DOT, /** Use the 'mad' function */ TILEMUL_MAD } TileMulCore; /** * @internal * @brief Update result operations * @ingroup BLAS_MAJOR_SUBGENS */ typedef enum UpdateResultOp { /** Just set the values stored in a target buffer */ UPRES_SET, /** Summarize values stored in a target buffer with the temporary result */ UPRES_SUM } UpdateResultOp; /** * @internal * @brief Update result generator flags * @ingroup BLAS_MAJOR_SUBGENS */ typedef enum UpdateResultFlags { /** Resulting matrix is stored in the column major form */ UPRES_COLUMN_MAJOR = 0x01, /** Generic version, non optimal sizes */ UPRES_GENERIC = 0x02, /** Multiply result on beta */ UPRES_WITH_BETA = 0x04, /** do not multiply on the alpha scalar */ UPRES_WITHOUT_ALPHA = 0x08, /** * Destination is private memory; * if not set destination is in the global one */ UPRES_PRIV_DEST = 0x10, /** Use the local memory instead the global memory */ UPRES_USE_LDS = 0x20, /** Generate the inline version */ UPRES_INLINE = 0x40, /** Disable vectorization at memory access */ UPRES_NO_VECTORIZATION = 0x80, /** For the generic version useful data reside at the tile rows' tail */ UPRES_TAIL_ROW = 0x100, /** For the generic version useful data reside at the tile columns' tail */ UPRES_TAIL_COL = 0x200, /** Generate condition whether coordinates don't exceed problem bounds */ UPRES_EXCEED_PROBLEM_CONDITION = 0x400, /****/ UPRES_INDEXING_WITH_CONSTANTS = 0x800, /** Write result to C instead of B for functions with triangular matrix */ UPRES_TRIANG_WRITE_C = 0x1000 } UpdateResultFlags; typedef struct PrivateArea { const char *typeName; unsigned int vecLen; unsigned int size; } PrivateArea; /** * @internal * @defgroup GEN_SETTINGS Generator settings * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @internal * @brief Kernel variable and argument names */ typedef struct KernelVarNames { const char *A; /**< Matrix A variable name */ const char *B; /**< Matrix B variable name */ const char *C; const char *LDS; /**< LDS pointer name */ const char *coordA; /**< Variable for subproblem A coordinate */ const char *coordB; /**< Variable for subproblem B coordinate */ const char *k; /**< Variable for incrementable K offset value*/ const char *skewA; /**< Variable for skews along A */ const char *skewB; /**< Variable for skews along B */ const char *skewK; /**< Variable for skews along K */ const char *sizeM; /**< Matrix A size M */ const char *sizeN; /**< Matrix B size N */ const char *sizeK; /**< Matrixes size K */ const char *lda; /**< Leading dimension of matrix A */ const char *ldb; /**< Leading dimension of matrix B */ const char *ldc; /**< Leading dimension 
of matrix C, in vectors */ const char *vectCoordA; /**< Vector containing indexes of tile a elements in matrix A */ const char *vectCoordB; /**< Vector containing indexes of tile b elements in matrix B*/ const char *startM; const char *startN; const char *startK; const char *alpha; const char *beta; } KernelVarNames; /** * @internal * @brief Blas generator settings * * This structure is designed to be used with most of subgenerators * and generator helpers. It is assumed to be initialized once at the * generator beginning and modified as few as possible over the rest of * the process. */ typedef struct BlasGenSettings { /** * Subproblem dimensions: * * work group dimensions are at index 0 * work item dimensions are at index 1 */ SubproblemDim subdims[2]; const PGranularity *pgran; /**< Data parallelism granularity */ const CLBLASKernExtra *kextra; /**< Kernel extra */ BlasGenFlags flags; /**< Global generator flags */ KernelVarNames varNames; /**< Kernel variables and argument names */ Tile tileA; Tile tileBX; Tile tileCY; } BlasGenSettings; /*@}*/ /** * @internal * @brief Variable names for the inline version of a function updating result * @ingroup BLAS_MAJOR_SUBGENS */ typedef struct UpresVarNames { const char *result; /**< Name of an output matrix */ /** Leading dimension of a matrix stored in the global memory */ const char *ld; const char *startRow; /**< Start row to update from */ const char *startCol; /**< Start column to update from */ const char *nrRows; /**< Number of rows */ const char *nrCols; /**< Number of columns */ const char *cachedName; /**< Name of lds chached values */ } UpresVarNames; /** * @internal * @brief Options for matrix tiles multiplication generator * @ingroup BLAS_MAJOR_SUBGENS */ typedef struct TileMulOpts { CLMemType memA; /**< type of memory matrix A is located on */ CLMemType memB; /**< type of memory matrix B is located on */ TileMulFlags flags; /**< Flags on objects and computing specifics */ TileMulCore core; /**< Multiply and add core */ int (*postFetch)( struct KgenContext *ctx, MatrixRole mrole, void *arg); /**< Tile post fetch callback */ void *postFetchPriv; /**< Postfetch callback's private date */ struct FetchContext *fctx; } TileMulOpts; typedef struct ZeroFuncs { char names[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN]; } ZeroFuncs; /** * @internal * @brief Private data for fetch postprocessing callback * @ingroup TAILS_HANDLING */ typedef struct TilePostFetchPrivate { BlasFunctionID funcID; const BlasGenSettings *gset; const char *regName; int fetchNumA; int wholeA; } TilePostFetchPrivate; void getPrivateAreaInfo( const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole, PrivateArea *area); void declarePrivateArea( struct KgenContext *ctx, const PrivateArea *area, const char *baseName, PrivateStorageType storType); /* * Declare separately the real and imaginary part of * a complex multiplier. * * @ctx: generator context * @baseName: variable's base name matching to an existing variable * with not sepated parts * @typeName: variable type name * * Rule naming * real part: R * imaginary part: I * * On success returns 0, and -EOVERFLOW at source buffer * overflowing */ int declareComplexMultParts( struct KgenContext *ctx, const char *baseName, const char *typeName); /** * @internal * @defgroup CHECK_DECOMP_CACL_GRAN Checking decomposition and calculate * parallelism granularity * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @brief Sanity check for decomposition * * @param[in] subdims Subproblem dimensions. 2 levels. 
* @param[in] minSize Minimum size for any of the dimension * components * @param[in] maxSize Maximum size which can't be exceeded by * any of the dimension components at the tile * layer * @param[in] maxRegs Maximum number of registers allowed to be used * @param[in] dtype BLAS data type * @param[in] wholeA Whether matrix A is stored in registers entirely or * partially * * The function rejects only decompositions that are completely invalid, lead * to consumption of too many registers, or have component values at the * tile layer that are out of the range [\b MinSize, \b MaxSize]. * Completely invalid decompositions are those which don't allow the * problem to be divided integrally among work items; e. g. zeroed components are wrong, * and step components (x, y, bwidth) of the 0-th level that are not integrally * divisible by the respective size components (itemX, itemY, bwidth) of the 1-st * level are wrong as well. The decomposition is also wrong if the size * components are not integrally divisible by the step components and not equal * to #SUBDIM_UNUSED. * * @return true if the decomposition is valid, or false otherwise */ bool decompSanityCheck( const SubproblemDim *subdims, unsigned int minSize, unsigned int maxSize, unsigned int maxRegs, DataType dtype, bool wholeA); /** * @brief Calculate granularity in the case when a work item is responsible * for its own part of the solution, not overlapping with those of other * items * * @param[out] pgran Location to store the calculated granularity * @param[in] subdims Subproblem dimensions * @param[in] xdim Dimension in the OpenCL work space the X component * of the decomposition is mapped onto * @param[in] level Function BLAS level. Reserved for future use. * * If the value of \b xdim is -1, the function assumes that the OpenCL work * space is single dimensional, and puts the product of the granularity against * the X and Y components into the 0-th element of the \b wgSize field. If its value is * 0 or 1, the function assumes that the OpenCL work space is 2D and puts the * granularity against the X component into the \b xdim element of the \b wgSize field * of the granularity descriptor. The granularity against the Y component is put into * the 1 - \b xdim element. Other values are invalid and force an abort in a debug * build. The function initializes the \b wgDim field properly. * * NOTE: currently this function is supported only for level 3 and * must not be called for level 2 */ void calcPgranDedicated( PGranularity *pgran, const SubproblemDim *subdims, int xdim, int level); /** * @brief Calculate granularity in the case when several items evaluate the same * part of the solution together * * @param[out] pgran Location to store the calculated granularity * @param[in] subdims Subproblem dimensions * @param[in] xdim Dimension in the OpenCL work space the X component * of the decomposition is mapped onto * @param[in] ydim Dimension in the OpenCL work space the Y component * of the decomposition is mapped onto * @param[in] level Function BLAS level. Reserved for future use * * If the \b xdim and \b ydim values are equal, the function puts the product * of the granularity against the X and Y components into the \b xdim element of the \b wgSize * field. If not, it puts separate granularities for X and Y into the \b xdim and * \b ydim elements respectively. Both values must be non-negative and less * than 3 (since an OpenCL workspace cannot have more than 3 dimensions). * If one of these parameters is zero, then the other one must be zero as well. * If one of these parameters is 2, then the other one must be 1. These * restrictions are caused by the need to reflect \b bwidth in the granularity * in case of a multidimensional decomposition. For 2D and 3D decompositions * the granularity for bwidth is calculated as well, and it is always mapped * onto the 0-th workspace dimension. If any of these parameters are wrong, * it forces an abort in a debug build. The function sets the \b wgDim field * to the maximum of xdim and ydim plus 1. * * NOTE: currently this function is supported only for level 3 and * must not be called for level 2 */ void calcPgranCooperative( PGranularity *pgran, const SubproblemDim *subdims, int xdim, int ydim, int level); /*@}*/ /** * @internal * @defgroup COMMON_MATH_OPERATIONS Constructing useful math expressions * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @brief Sprintf a complex MAD operation * * Operations: * - \f$ dst \leftarrow a * b + c \f$ * - \f$ dst \leftarrow conj(a) * b + c \f$ * - \f$ dst \leftarrow a * conj(b) + c \f$ * - \f$ dst \leftarrow conj(a) * conj(b) + c \f$ * * @param[out] expr String object to hold the target expression * @param[in] dst Destination argument * @param[in] a The first multiplier * @param[in] b The second multiplier * @param[in] c Added argument * @param[in] isDouble If set, the arguments have double precision * @param[in] conjA If set, the argument A should be conjugated * @param[in] conjB If set, the argument B should be conjugated * @param[in] core Multiplying core * * The \b c argument can be NULL. In this case it is ignored, and the function * produces a pure multiplication */ void sprintfComplexMulUpdate( Kstring *expr, const Kstring *dst, const Kstring *a, const Kstring *b, const Kstring *c, bool isDouble, bool conjA, bool conjB, TileMulCore core); void sprintfComplexMulUpdate_syr2k_beta0( Kstring *expr, const Kstring *dst, const Kstring *a, const Kstring *b, const Kstring *c, bool isDouble, bool conjA, bool conjB, TileMulCore core); /** * @brief Sprintf an expression of a fast scalar mad * * @param[out] expr Output expression * @param[in] first First multiplier * @param[in] second Second multiplier * @param[in] scale Scale of the second argument, i.e. its divisor. * Ignored if zero. * @param[in] third Added argument. Ignored if NULL. * * It can use mad24. 
So, expected result should not exceed 2^24 */ void sprintfFastScalarMad( Kstring *expr, const Kstring *first, const Kstring *second, unsigned int scale, const Kstring *third); /*@}*/ /** * @internal * @defgroup BLAS_GEN_MISC_FUNCTIONS Miscellaneous functions * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @brief Default function prefix for the data type * * @param[in] dtype One of the data types supported by the library */ char dtypeToBlasPrefix(DataType dtype); /** * @brief Convert kernel extra flags to tilemul flags * * @param[in] funcID BLAS function ID * @param[in] kflags Kernel flags */ TileMulFlags kextraToTilemulFlags(BlasFunctionID funcID, KernelExtraFlags kflags); /** * @brief Get vector length elements should be fetched from (stored to) * the global memory * * @param[in] gset Generator settings * @param[in] funcID BLAS function ID (deprecated) * @param[in] mrole Role of the matrix to get vectorization for */ unsigned int getVecLen(const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole); /** * @brief Sprintf chunk (set of components) of an OpenCL vector type * * @param[out] chunk Buffer to sprintf to * @param[in] vecLen Entire vector length * @param[in] clen Length of the chunk * @param[in] vecOff Starting component offset */ void sprintfVecChunk( char *chunk, unsigned int vecLen, unsigned int clen, unsigned int vecOff); /** * @brief Generate code containing scaling of leading dimensions on * vector size * * @param[out] ctx Generator context * @param[in] gset Generator settings * * The function first checks whether the scaling is actually needed. * If vector size is 1. If some of the kernel variables for 'lda', 'ldb' * or 'ldc' is NULL, the function skips code generation for the dimension. * Calling this function has no effect if the @ref BGF_LD_IN_VECTORS generator * flag is not set. If some of the leading dimensions are not unique, only * one of the instances is scaled. Originality of the dimensions is detected * by values of the respective pointers being a part of @ref KernelVarNames. * For example, 'lda' and 'ldb' pointers are the same, only 'lda' is scaled. */ void genScaleLeadingDimensions(struct KgenContext *ctx, const BlasGenSettings *gset); /*@}*/ /** * @internal * @brief Generate default post processing logic after tile fetch * * @param[out] ctx Generator context * @param[in] mrole Matrix role * @priv[out] Handler's private data * * @ingroup TAILS_HANDLING */ int defaultTilePostFetch( struct KgenContext *ctx, MatrixRole mrole, void *priv); void getResultGPRsInfo( DataType dtype, const SubproblemDim *dims, unsigned int vecLen, unsigned int *nrRegs, const char **typeName); /** * @internal * @defgroup BLAS_MAJOR_SUBGENS Major subgenerators * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @internal * @brief Tiles fetching and multiplication inlined code generator * * @param[out] ctx Generator context * @param[in] gset Generator settings * @param[in] mulOpts TileMul-specific generator settings * * This function generates code which fetches tiles a and b from global or local * memory into private memory, multiply them storing result into tile c in * private memory and increment coordinate k. Caller is responsible for loop * along K.\n * All combinations of tiles a and b orientations are supported. Generated * code fetches tiles by vectors which size can be different for tiles a and b. * Complex types and conjugated tiles are supported. 
Global cycling is supported * for global memory fetching - this mean that if tile overlaps matrix * the tail of tile will be fetched from the beginning instead of accessing * memory outside the matrix.\n * Second level of subdimensions is used for tiles sizes.\n * Generated code will fetch tiles a, b, multiply them and add result to tile c * in private memory, then increment k. By default, k is incremented by * second level bwidth but it is incremented by first level bwidth if * @ref TILEMUL_BW_STRIDE flag is set. It is used if whole work group goes * along K loop.\n * Each tile can be fetched from global memory or from local memory. * If tile is fetched from local memory then leading dimensions for local * memory area are taken from first level subdimensions.\n * Post-fetch callback generator function can be called after fetching tiles * for zeroing tails or setting diagonal elements to one. This function is * provided by caller.\n * If second level bwidth is not equal to first level bwidth, and * @ref TILEMUL_BW_STRIDE flag is not set then TileMul generates * loop from zero to first level bwidth with second level bwidth step. The * most common case is second level bwidth equal to first level bwidth where * single iteration of multiplication is generated.\n * * If the caller assume for efficient fetching from the global memory and the * tilemul logic is generated within a loop, prepareFetchCycle() should be * called before generation of the loop. * * @return 0 on success * @return -EOVERFLOW on source buffer overflowing * @return -EINVAL if input arguments are invalid */ int tileMulGen( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts); /** * @internal * @brief Tiles pure multiplication code generator * * @param[out] ctx Generator context * @param[in] gset Generator settings * @param[in] mulOpts TileMul-specific generator settings * * This function multiply two tiles, a and b, storing result in tile c. No * additional operations are made. It just performs tiles multiplication without * fetching, post-fetch processing and incrementing coordinates which can be * made by caller. * * @return 0 on success * @return -EOVERFLOW on source buffer overflowing */ int genMulTiles( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts); /** * @internal * @brief Update result generator * * @param[out] ctx Generator context * @param[in] gset Generator settings * @param[in] op Update operation * @param[in] flags Update result flags * @argNames * * It generates a function applying an operation to the temporary result * stored in the private memory and updating the target result. *\n * The code can be generated as well in the form of callable function * as in the inlined form. *\n * List of taken argument differs depending on specified flags. In general, * these functions are defined as: \n * @code * void * funcName( * C, * *c, * alpha, * size_t startRow, * size_t startCol, * size_t ld * [, beta] * [,size_t nrRows] * [,size_t nrCols]) * @endcode * * @return 0 on success, -EOVERFLOW at source buffer overflowing. 
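 *
 * A minimal usage sketch (an illustration only: the generator context 'ctx',
 * the settings 'gset', the function id 'funcId' and the variable names placed
 * into the UpresVarNames structure are hypothetical and are not taken from
 * the library sources; the enum values used are those defined above):
 * @code
 * UpresVarNames uvars = {0};               // unset fields remain NULL
 * uvars.result   = "C";                    // name of the output matrix argument
 * uvars.ld       = "ldc";                  // its leading dimension
 * uvars.startRow = "coordA";               // first row to update
 * uvars.startCol = "coordB";               // first column to update
 * uvars.nrRows   = "y";
 * uvars.nrCols   = "x";
 * int ret = updateResultGen(ctx, gset, funcId, UPRES_SUM,
 *                           UPRES_COLUMN_MAJOR | UPRES_WITH_BETA | UPRES_INLINE,
 *                           &uvars);
 * @endcode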
*/ int updateResultGen( struct KgenContext *ctx, const BlasGenSettings *gset, BlasFunctionID funcId, UpdateResultOp op, UpdateResultFlags flags, const UpresVarNames *uvarNames); /** * @internal * @brief Produce a code updating a single result element * * @param[out] ctx Generator context * @param[in] dst Destination element expression * @param[in] src Source element expression * @param[in] gset Generator settings * @param[in] op Update operation * @param[in] flags Flags showing specifics of the code needed to be * generated * * @return 0 on success, -EOVERFLOW if the source buffer is exceeded. */ int genUpdateResultSingle( struct KgenContext *ctx, const char *dst, const char *src, const BlasGenSettings *gset, UpdateResultOp op, UpdateResultFlags flags); /*@}*/ TailFetch checkForTailFetches( BlasFunctionID funcID, const SubproblemDim *dim, const CLBLASKernExtra *kextra, MatrixRole mrole, bool distVect, bool lowerTails); bool isNeedZeroTileTail( BlasFunctionID funcID, const SubproblemDim *dim, const CLBLASKernExtra *kextra, MatrixRole mrole, bool distVect); /** * @internal * @brief Generate tail coordinates adjustment if needed * * @param[out] ctx Generator context * @param[in] funcID BLAS function ID * @param[in] gset Generator settings * @param[out] *error Location to store error. * Ignored if NULL. * * Adjust coordinates if work is distributed over matrix rows so as * a tile would not exceed the matrix bound. Cyclic addressing is not * applicable for that since skew over rows can be used for performance goals. * * If it's needed, issues an expression like * * if (coord.y + dy > M) { * coord.y -= dy - M % dy; * } * * Return status showing if the tails have been actually adjusted or not. * If \b ctx is NULL the function doesn't try to generate a code, but just * return actual tail handling status * * @ingroup TAILS_HANDLING */ TailStatus checkGenAdjustTailCoords( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, int *error); /** * @internal * @brief Generate restoring original coordinates if needed * * @param[out] ctx Generator context * @param[in] gset Generator settings * @param[in] status Tails handling status * * Coordinates restoring is needed to have ability to write back result to * a correct location. * * If it's needed, issues an expression like * * if (coord.y + dy == M) { * coord.y += dy - M % dy; * } * * @ingroup TAILS_HANDLING */ int checkGenRestoreTailCoords( struct KgenContext *ctx, const BlasGenSettings *gset, TailStatus status); /** * @internal * @brief Convert tail handling status to the respective flags * of the update result generator * * @param[in] status Status of the handling to convert to * the update result flags * * @ingroup TAILS_HANDLING */ UpdateResultFlags tailStatusToUpresFlags(TailStatus status); #endif /* BLAS_KGEN_H_ */ clblas-2.10/src/library/blas/gens/blas_subgroup.c000066400000000000000000000333431264277366700220510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "blas_subgroup.h" #include #include #include #include #include #include "blas_kgen.h" #include "gen_helper.h" #include "tile_iter.h" #include "kerngen.h" static int calcMergeStepSubgrN( const BlasGenSettings* pGSet, DataType dtype); static int declareSubgrLDS( struct KgenContext* pCtx, const BlasGenSettings* pGSet, DataType dtype); //----------------------------------------------------------------------------- // calculates best number of subgroups to be engaged in each merge step // simultaneously // Calculation is based on the register usage estimation // in order not to limit // the number of workgroups scheduled on the SIMD engine static int calcMergeStepSubgrN( const BlasGenSettings* pGSet, DataType dtype) { // hardware-specific options const int deviceLDS = 32768; const unsigned int gprsPerUnit = 240; int vecLenA = 0; int vecLenB = 0; int vecLenC = 0; int vecNumA = 0; int vecNumB = 0; int vecNumC = 0; int subgPerStep = 0; int bestLDS = 0; int gprsUsed = 0; int subgNum = 0; int itemsPerSubgroup = 0; if( NULL == pGSet || NULL == pGSet->pgran ){ return -EINVAL; } itemsPerSubgroup = pGSet->subdims[0].bwidth/ pGSet->subdims[1].bwidth; subgNum = (pGSet->subdims[0].x/pGSet->subdims[1].x)* (pGSet->subdims[0].y/pGSet->subdims[1].y); vecLenA = pGSet->tileA.vecLen; vecLenB = pGSet->tileBX.vecLen; vecLenC = pGSet->tileCY.vecLen; vecNumA = tileVectorsNum( &pGSet->tileA ); vecNumB = tileVectorsNum( &pGSet->tileBX ); vecNumC = tileVectorsNum( &pGSet->tileCY ); // registers hold 4-vectors of 32-bit floats or 2-vectors of doubles switch(dtype){ case TYPE_FLOAT: // each register holds 4 4-byte float values // 10 registers are used address, etc gprsUsed = vecNumA * (vecLenA/4) + vecNumB * (vecLenB/4) + vecNumC * (vecLenC/4) + 10; bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); subgPerStep = bestLDS/(itemsPerSubgroup * vecNumC * vecLenC * 4 );//4-byte floats break; case TYPE_DOUBLE: // each register can hold 2 double values // 10 registers are used for address, etc gprsUsed = vecNumA * (vecLenA/2) + vecNumB * (vecLenB/2) + vecNumC * (vecLenC/2) + 10; bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); subgPerStep = bestLDS/(itemsPerSubgroup * vecNumC * vecLenC * 8 );//8-byte doubles break; case TYPE_COMPLEX_FLOAT: // each register holds 2 4-byte float-based complex values // 10 registers are used address, etc gprsUsed = vecNumA * (vecLenA/2) + vecNumB * (vecLenB/2) + vecNumC * (vecLenC/2) + 10; bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); subgPerStep = bestLDS/(itemsPerSubgroup * vecNumC * vecLenC * 8 );//2x4-byte floats break; case TYPE_COMPLEX_DOUBLE: // each register can hold 1 double-based complex value // 10 registers are used for address, etc gprsUsed = vecNumA * (vecLenA) + vecNumB * (vecLenB) + vecNumC * (vecLenC) + 10; bestLDS = deviceLDS/(gprsPerUnit/gprsUsed); subgPerStep = bestLDS/(itemsPerSubgroup * vecNumC * vecLenC * 16 );//2x8-byte double break; default: break ; } if( 0==subgPerStep ){ subgPerStep = 1; } // do not exceed physical number of subgroups in workgroup if( subgPerStep > subgNum ){ subgPerStep = subgNum; } return subgPerStep; } //----------------------------------------------------------------------------- // Add LDS array declaration(based on C matrix parameters) to the context // each row of C Matrix block may be splitted into separate vectors static int declareSubgrLDS( struct KgenContext* pCtx, const 
BlasGenSettings* pGSet, DataType dtype) { int vecLenC = 0; int vecNumC = 0; const char* typeName; const KernelVarNames *vnames = NULL; char tmp[512]; int itemsPerSubgroup = 0; int subgrPerStep = 0; if( NULL == pCtx || NULL == pGSet ){ return -EINVAL; } itemsPerSubgroup = pGSet->subdims[0].bwidth / pGSet->subdims[1].bwidth; subgrPerStep = calcMergeStepSubgrN(pGSet, dtype); vecLenC = pGSet->tileCY.vecLen; vecNumC = tileVectorsNum( &pGSet->tileCY ); typeName = dtypeBuiltinType(dtype); vnames = &pGSet->varNames; switch(dtype){ case TYPE_FLOAT: case TYPE_DOUBLE: if( vecLenC > 1){ sprintf( tmp, "__local %s%d a%s[%d*%d*%d];\n" "__local %s%d *%s = a%s;\n", typeName, vecLenC, vnames->LDS, itemsPerSubgroup, subgrPerStep, vecNumC, typeName, vecLenC, vnames->LDS, vnames->LDS); } else{ sprintf( tmp, "__local %s a%s[%d*%d*%d];\n" "__local %s *%s = a%s;\n", typeName, vnames->LDS, itemsPerSubgroup, subgrPerStep, vecNumC, typeName, vnames->LDS, vnames->LDS); } break; case TYPE_COMPLEX_FLOAT: sprintf( tmp, "__local float%d a%s[%d*%d*%d];\n" "__local float%d *%s = a%s;\n", vecLenC*2, vnames->LDS, itemsPerSubgroup, subgrPerStep, vecNumC, vecLenC*2, vnames->LDS, vnames->LDS); break; case TYPE_COMPLEX_DOUBLE: sprintf( tmp, "__local double%d a%s[%d*%d*%d];\n" "__local double%d *%s = a%s;\n", vecLenC*2, vnames->LDS, itemsPerSubgroup, subgrPerStep, vecNumC, vecLenC*2, vnames->LDS, vnames->LDS); break; default: // to avoid compilation warning break; } kgenAddStmt( pCtx, tmp ); return 0; } //----------------------------------------------------------------------------- int mergeUpdateResult( struct KgenContext* pCtx, BlasFunctionID funcID, struct BlasGenSettings* pGSet, struct SubgVarNames* pSubgVNames, UpdateResultFlags upResFlags, UpresProcPtr upresProcPtr ) { char tmp[2048]; int subgN = 0; int subgItems = 0; int aBlkH = 0; DataType dtype; Tile tileC; Tile tileScratch; KernelVarNames* pVNames; unsigned int vecLenC; unsigned int vecNumC; int subgPerStep = 0; if( NULL == pCtx || NULL == pGSet ){ return -EINVAL; } dtype = pGSet->kextra->dtype; subgN = ( pGSet->subdims[0].x/pGSet->subdims[1].x ) * ( pGSet->subdims[0].y/pGSet->subdims[1].y ); subgItems = pGSet->subdims[0].bwidth/ pGSet->subdims[1].bwidth; aBlkH = pGSet->subdims[1].y; pVNames = &pGSet->varNames; // calculate best number of subgroups to be engaged in each merge step subgPerStep = calcMergeStepSubgrN( pGSet, dtype ); vecLenC = pGSet->tileCY.vecLen; vecNumC = tileVectorsNum( &pGSet->tileCY ); kgenAddStmt(pCtx,"//-----MergeUpdateResult\n"); kgenAddBlankLine(pCtx); // declare local data storage array kgenAddStmt( pCtx, "// veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]\n"); declareSubgrLDS( pCtx, pGSet, dtype); kgenAddBlankLine( pCtx ); kgenAddStmt(pCtx, "//LDS block has the same vectorization as C matrix block\n"); kgenAddStmt( pCtx, "//VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS" " +get_local_id(0) );\n"); sprintf(tmp, "scratch += " "%d*(" "(%s.y%%%d)*%d +" "%s.x );\n", vecNumC, pSubgVNames->itemId, subgPerStep, subgItems, pSubgVNames->itemId ); kgenAddStmt(pCtx, tmp); sprintf( tmp, "\nfor( uint mstep = 0; mstep < %d; mstep += %d )", subgN, subgPerStep); kgenBeginBranch(pCtx,tmp); kgenAddBlankLine(pCtx); sprintf( tmp, "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )", pSubgVNames->itemId, pSubgVNames->itemId, subgPerStep); kgenBeginBranch(pCtx,tmp); // the LDS block size is similar to C matrix block size kgenAddBlankLine(pCtx); initTile(&tileC, "c", (unsigned int)pGSet->subdims[1].y, (unsigned int)pGSet->subdims[1].x, vecLenC, dtype, pGSet->tileCY.storType, 
pGSet->tileCY.trans, pGSet->tileCY.packed); initTile(&tileScratch, "scratch", (unsigned int)pGSet->subdims[1].y, (unsigned int)pGSet->subdims[1].x, vecLenC, dtype, PRIV_STORAGE_ARRAY, pGSet->tileCY.trans, pGSet->tileCY.packed); genTileCopy(pCtx, &tileScratch, &tileC, TILECOPY_ASSIGN); genZeroTile(pCtx, &tileC); // split merge if kgenEndBranch( pCtx, NULL ); // merge step if kgenAddBlankLine( pCtx ); //splitting if on two, to prevent barrier issue kgenAddBarrier( pCtx, CLK_LOCAL_MEM_FENCE ); kgenAddBlankLine( pCtx ); //---------------------------------------------- sprintf( tmp, "if( (%s.y >= mstep)&&(%s.y < (mstep+%d)) )", pSubgVNames->itemId, pSubgVNames->itemId, subgPerStep); kgenBeginBranch(pCtx,tmp); sprintf( tmp, "if ( 0 == %s.x )", pSubgVNames->itemId ); kgenBeginBranch( pCtx, tmp ); kgenAddBlankLine(pCtx); // Zero element of each subgroup also performs LDS merge sprintf( tmp, "for(uint k = 0; k < %d * %d; k += %d)", subgItems, aBlkH, aBlkH); kgenBeginBranch(pCtx, tmp); kgenAddBlankLine(pCtx); genTileCopy(pCtx, &tileC, &tileScratch, TILECOPY_ADD_ASSIGN ); kgenAddStmt(pCtx, "//Adding the LDS block size in vectors\n"); sprintf(tmp, "%s += %d;", pVNames->LDS, vecNumC); kgenAddStmt(pCtx, tmp); kgenAddBlankLine(pCtx); kgenEndBranch( pCtx, NULL ); // merge for() kgenAddBlankLine( pCtx ); // Write into global memory ------------------------------- if ( NULL != upresProcPtr ) { (*upresProcPtr)( pCtx, funcID, pGSet, upResFlags /*| UPRES_INDEXING_WITH_CONSTANTS*/, NULL, NULL, NULL ); } kgenAddBlankLine(pCtx); kgenEndBranch(pCtx, NULL); // merge and global write if kgenEndBranch(pCtx, NULL); // LDS write if kgenAddBarrier(pCtx, CLK_LOCAL_MEM_FENCE); //LDS write for kgenEndBranch(pCtx, NULL); return 0; } //----------------------------------------------------------------------------- int subgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, void* pArgs ) { int itemsPerSubg = 8; int subgA = 4; int subgB = 2; int bw1 = 8; int x1 = 4; int y1 = 4; CLBlasKargs *kargs; if ( NULL == pArgs ) { return -EINVAL; } kargs = (CLBlasKargs *)pArgs; if( isComplexType(kargs->dtype) ){ bw1 /= 2; } if( isDoubleBasedType(kargs->dtype) ){ bw1 /= 2; } subdims[1].bwidth = bw1; subdims[1].x = subdims[1].itemX = x1; subdims[1].y = subdims[1].itemY = y1; subdims[0].bwidth = bw1 * itemsPerSubg; subdims[0].itemX = x1 * subgB; subdims[0].x = x1*subgB; subdims[0].itemY = y1*subgA; subdims[0].y = y1*subgA; switch ( pgran->wgDim ) { case 1: pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; break; case 2: pgran->wgSize[0] = itemsPerSubg; pgran->wgSize[1] = 64/itemsPerSubg; break; default: pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; break; } return 0; } clblas-2.10/src/library/blas/gens/blas_subgroup.h000066400000000000000000000032631264277366700220540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SUBGROUP_H #define SUBGROUP_H #include #include #include #include #include #include #include #include "blas_kgen.h" #include "tile.h" #include "fetch.h" typedef int (*UpresProcPtr)( struct KgenContext*, BlasFunctionID, const BlasGenSettings *, UpdateResultFlags, const char *, const char *, const char *); /** */ typedef struct SubgVarNames { const char* subgCoord; // 2-vector of subgroup ID by X and Y const char* itemId; // 2-vector of subgroup item id/subgroupID } SubgVarNames; /** */ int mergeUpdateResult( struct KgenContext* pCtx, BlasFunctionID funcID, struct BlasGenSettings* pGSet, SubgVarNames* pSubgVNames, UpdateResultFlags upResFlags, UpresProcPtr upresProcPtr ); /** */ int subgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, void* pArgs ); #endif clblas-2.10/src/library/blas/gens/clTemplates/000077500000000000000000000000001264277366700213055ustar00rootroot00000000000000clblas-2.10/src/library/blas/gens/clTemplates/asum.cl000066400000000000000000000041731264277366700225770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ static const char *asum_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif __kernel void %PREFIXasum_kernel( __global %TYPE *_X, __global %PTYPE *scratchBuff, uint N, uint offx, int incx) { __global %TYPE *X = _X + offx; %TYPE asum = (%TYPE) 0.0; #ifdef INCX_NEGATIVE if( get_global_id(0) == 0 ) { scratchBuff[0] = (%PTYPE)0.0; } return; #endif int gOffset; for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1)____x_x where // - is N or T representing the transpose operation on A // - is N or T representing the transpose operation on B // - is a required divisor of N (1 for any value) // - is a required divisor of M (1 for any value) // - is a required divisor of K (1 for any value) // - x is the block size // - x is the number of points computed per work-item // // For instance a kernel named 'dgemm_NT_16_32_1_8x8_2x4' // - would implement C = C + A * B' // - for N multiple of 16 // - for M multiple of 32 // - for any value of K // - using work-groups of size (8,8) // - with each thread computing 2x4 points of C // // // The kernel prototype shall be compatible with // // __kernel void dgemm( __global double const * restrict A, // __global double const * restrict B, // __global double * C, // uint M, // uint N, // uint K, // double alpha, // double beta, // uint lda, // uint ldb, // uint ldc, // uint offsetA, // uint offsetB , // uint offsetC // ) // // // // // // ===== dgemm_NT_MN48.cl static const char * dgemm_NT_24_24_8_8x8_3x3__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_24_24_8_8x8_3x3__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[3][3] = {(double)0}; double rA[1][3]; double rB[1][3]; double PreFetchA[3]; double PreFetchB[3]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[192]; __local double lB[192]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*24+ idxT + idyT*lda; B += gidy*24+ idxT + idyT*ldb; __local double* plA = lA + idyT*24+idxT; __local double* plB = lB + idyT*24+idxT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; PreFetchB[2] = B[16]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; offA += 24; offB += 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); } 
barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*24; C+= idx; C+= gidy*24*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; } "; static const char * dgemm_NT_16_16_8_8x8_2x2__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_16_16_8_8x8_2x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[2][2] = {(double)0}; double rA[1][2]; double rB[1][2]; double PreFetchA[2]; double PreFetchB[2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[128]; __local double lB[128]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*16+ idxT + idyT*lda; B += gidy*16+ idxT + idyT*ldb; __local double* plA = lA + idyT*16+idxT; __local double* plB = lB + idyT*16+idxT; plA[0] = A[0]; plA[8] = A[8]; plB[0] = B[0]; plB[8] = B[8]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; offA += 16; offB += 16; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*16; C+= idx; C+= gidy*16*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; } "; static const char * dgemm_NT_24_24_8_8x8_3x3__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_24_24_8_8x8_3x3__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[3][3] = {(double)0}; double rA[1][3]; double rB[1][3]; double PreFetchA[3]; double PreFetchB[3]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[192]; __local double lB[192]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*24+ idxT + idyT*lda; B 
+= gidy*24+ idxT + idyT*ldb; __local double* plA = lA + idyT*24+idxT; __local double* plB = lB + idyT*24+idxT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; PreFetchB[2] = B[16]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; offA += 24; offB += 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*24; C+= idx; C+= gidy*24*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; } "; static const char * dgemm_NT_16_16_8_8x8_2x2__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_16_16_8_8x8_2x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[2][2] = {(double)0}; double rA[1][2]; double rB[1][2]; double PreFetchA[2]; double PreFetchB[2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[128]; __local double lB[128]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*16+ idxT + idyT*lda; B += gidy*16+ idxT + idyT*ldb; __local double* plA = lA + idyT*16+idxT; __local double* plB = lB + idyT*16+idxT; plA[0] = A[0]; plA[8] = A[8]; plB[0] = B[0]; plB[8] = B[8]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; offA += 16; offB += 16; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*16; C+= idx; C+= gidy*16*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0]; 
C[8*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[8*ldc] = alpha*rC[1][1]; } "; static const char * dgemm_NN_24_24_8_8x8_3x3__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_24_24_8_8x8_3x3__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[3][3] = {(double)0}; double rA[1][3]; double rB[1][3]; double PreFetchA[3]; double PreFetchB[3]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[192]; __local double lB[192]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*24+ idxT + idyT*lda; B += gidy*24*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*24+idxT; __local double* plB = lB + idxT*24+idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; offA += 24; offB += 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*24; C+= idx; C+= gidy*24*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; } "; static const char * dgemm_NN_16_16_8_8x8_2x2__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_16_16_8_8x8_2x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[2][2] = {(double)0}; double rA[1][2]; double rB[1][2]; double PreFetchA[2]; double PreFetchB[2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[128]; __local double lB[128]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A 
+= gidx*16+ idxT + idyT*lda; B += gidy*16*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*16+idxT; __local double* plB = lB + idxT*16+idyT; plA[0] = A[0]; plA[8] = A[8]; plB[0] = B[0]; plB[8] = B[8*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; offA += 16; offB += 16; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*16; C+= idx; C+= gidy*16*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; } "; static const char * dgemm_NN_24_24_8_8x8_3x3__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_24_24_8_8x8_3x3__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[3][3] = {(double)0}; double rA[1][3]; double rB[1][3]; double PreFetchA[3]; double PreFetchB[3]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[192]; __local double lB[192]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*24+ idxT + idyT*lda; B += gidy*24*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*24+idxT; __local double* plB = lB + idxT*24+idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; offA += 24; offB += 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*24; C+= idx; C+= gidy*24*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; 
C[16*ldc] = alpha*rC[0][2]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; } "; static const char * dgemm_NN_16_16_8_8x8_2x2__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_16_16_8_8x8_2x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[2][2] = {(double)0}; double rA[1][2]; double rB[1][2]; double PreFetchA[2]; double PreFetchB[2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[128]; __local double lB[128]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*16+ idxT + idyT*lda; B += gidy*16*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*16+idxT; __local double* plB = lB + idxT*16+idyT; plA[0] = A[0]; plA[8] = A[8]; plB[0] = B[0]; plB[8] = B[8*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; offA += 16; offB += 16; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*16; C+= idx; C+= gidy*16*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0]; C[8*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[8*ldc] = alpha*rC[1][1]; } "; clblas-2.10/src/library/blas/gens/clTemplates/dgemm_hawai.cl000066400000000000000000005644711264277366700241100ustar00rootroot00000000000000 // DGEMM kernels for Hawai & Tahiti // // All kernels are ColumnMajor. 
The case RowMajor is handled by an earlier transformation // into an equivalent ColumnMajor (using the property that (A*B)' is equal to B'A' that the // conversion from Row to Column major is basically a transpose) // // The naming scheme for the kernels is dgemm_____x_x where // - is N or T representing the transpose operation on A // - is N or T representing the transpose operation on B // - is a required divisor of N (1 for any value) // - is a required divisor of M (1 for any value) // - is a required divisor of K (1 for any value) // - x is the block size // - x is the number of points computed per work-item // // For instance a kernel named 'dgemm_NT_16_32_1_8x8_2x4' // - would implement C = C + A * B' // - for N multiple of 16 // - for M multiple of 32 // - for any value of K // - using work-groups of size (8,8) // - with each thread computing 2x4 points of C // // // The kernel prototype shall be compatible with // // __kernel void dgemm( __global double const * restrict A, // __global double const * restrict B, // __global double * C, // uint M, // uint N, // uint K, // double alpha, // double beta, // uint lda, // uint ldb, // uint ldc, // uint offsetA, // uint offsetB , // uint offsetC // ) // // // // // // ===== dgemm_NT_MN48.cl static const char * dgemm_NT_48_48_8_8x8_6x6__ALPHABETA = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_8_8x8_6x6__ALPHABETA(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; uint block_k = K >> 3; do { __local double* plA = lA + idy*48 + 2*idx; __local double* plB = lB + idy*48 + 2*idx; barrier(CLK_LOCAL_MEM_FENCE); vstore2( uA.d2v[0 ], 0, plA+0 ); vstore2( uA.d2v[8 ], 0, plA+16 ); vstore2( uA.d2v[16], 0, plA+32 ); vstore2( uB.d2v[0 ], 0, plB+0 ); vstore2( uB.d2v[8 ], 0, plB+16 ); vstore2( uB.d2v[16], 0, plB+32 ); barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < 8; k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] 
= mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); } uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; (C[(offset_x + 0) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 0) * ldc], alpha * rC[0][0])); (C[(offset_x + 1) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 0) * ldc], alpha * rC[0][1])); (C[(offset_x + 0) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 1) * ldc], alpha * rC[1][0])); (C[(offset_x + 1) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 1) * ldc], alpha * rC[1][1])); (C[(offset_x + 0) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 16) * ldc], alpha * rC[2][0])); (C[(offset_x + 1) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 16) * ldc], alpha * rC[2][1])); (C[(offset_x + 0) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 17) * ldc], alpha * rC[3][0])); (C[(offset_x + 1) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 17) * ldc], alpha * rC[3][1])); (C[(offset_x + 0) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 32) * ldc], alpha * rC[4][0])); (C[(offset_x + 1) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 32) * ldc], alpha * rC[4][1])); (C[(offset_x + 0) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 33) * ldc], alpha * rC[5][0])); (C[(offset_x + 1) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 33) * ldc], alpha * rC[5][1])); (C[(offset_x + 16) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 0) * ldc], alpha * rC[0][2])); (C[(offset_x + 17) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 0) * ldc], alpha * rC[0][3])); (C[(offset_x + 16) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 1) * ldc], alpha * rC[1][2])); (C[(offset_x + 17) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 1) * ldc], alpha * rC[1][3])); (C[(offset_x + 16) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 16) * ldc], alpha * rC[2][2])); (C[(offset_x + 17) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 16) * ldc], alpha * rC[2][3])); (C[(offset_x + 16) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 17) * ldc], alpha * rC[3][2])); (C[(offset_x + 17) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 17) * ldc], alpha * rC[3][3])); (C[(offset_x + 16) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 32) * ldc], alpha * rC[4][2])); (C[(offset_x + 17) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 32) * ldc], alpha * rC[4][3])); (C[(offset_x + 16) + (offset_y + 33) * 
ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 33) * ldc], alpha * rC[5][2])); (C[(offset_x + 17) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 33) * ldc], alpha * rC[5][3])); (C[(offset_x + 32) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 0) * ldc], alpha * rC[0][4])); (C[(offset_x + 33) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 0) * ldc], alpha * rC[0][5])); (C[(offset_x + 32) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 1) * ldc], alpha * rC[1][4])); (C[(offset_x + 33) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 1) * ldc], alpha * rC[1][5])); (C[(offset_x + 32) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 16) * ldc], alpha * rC[2][4])); (C[(offset_x + 33) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 16) * ldc], alpha * rC[2][5])); (C[(offset_x + 32) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 17) * ldc], alpha * rC[3][4])); (C[(offset_x + 33) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 17) * ldc], alpha * rC[3][5])); (C[(offset_x + 32) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 32) * ldc], alpha * rC[4][4])); (C[(offset_x + 33) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 32) * ldc], alpha * rC[4][5])); (C[(offset_x + 32) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 33) * ldc], alpha * rC[5][4])); (C[(offset_x + 33) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 33) * ldc], alpha * rC[5][5])); } "; static const char * dgemm_NT_48_48_8_8x8_6x6__ALPHA = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_8_8x8_6x6__ALPHA(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; uint block_k = K >> 3; do { __local double* plA = lA + idy*48 + 2*idx; __local double* plB = lB + idy*48 + 2*idx; barrier(CLK_LOCAL_MEM_FENCE); vstore2( uA.d2v[0 ], 0, plA+0 ); vstore2( uA.d2v[8 ], 0, plA+16 ); vstore2( uA.d2v[16], 0, plA+32 ); vstore2( uB.d2v[0 ], 0, plB+0 ); vstore2( uB.d2v[8 ], 0, plB+16 ); vstore2( uB.d2v[16], 0, plB+32 ); barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < 8; k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); 
rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); } uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; C[(offset_x + 0) + (offset_y + 0) * ldc] = alpha * rC[0][0]; C[(offset_x + 1) + (offset_y + 0) * ldc] = alpha * rC[0][1]; C[(offset_x + 0) + (offset_y + 1) * ldc] = alpha * rC[1][0]; C[(offset_x + 1) + (offset_y + 1) * ldc] = alpha * rC[1][1]; C[(offset_x + 0) + (offset_y + 16) * ldc] = alpha * rC[2][0]; C[(offset_x + 1) + (offset_y + 16) * ldc] = alpha * rC[2][1]; C[(offset_x + 0) + (offset_y + 17) * ldc] = alpha * rC[3][0]; C[(offset_x + 1) + (offset_y + 17) * ldc] = alpha * rC[3][1]; C[(offset_x + 0) + (offset_y + 32) * ldc] = alpha * rC[4][0]; C[(offset_x + 1) + (offset_y + 32) * ldc] = alpha * rC[4][1]; C[(offset_x + 0) + (offset_y + 33) * ldc] = alpha * rC[5][0]; C[(offset_x + 1) + (offset_y + 33) * ldc] = alpha * rC[5][1]; C[(offset_x + 16) + (offset_y + 0) * ldc] = alpha * rC[0][2]; C[(offset_x + 17) + (offset_y + 0) * ldc] = alpha * rC[0][3]; C[(offset_x + 16) + (offset_y + 1) * ldc] = alpha * rC[1][2]; C[(offset_x + 17) + (offset_y + 1) * ldc] = alpha * rC[1][3]; C[(offset_x + 16) + (offset_y + 16) * ldc] = alpha * rC[2][2]; C[(offset_x + 17) + (offset_y + 16) * ldc] = alpha * rC[2][3]; C[(offset_x + 16) + (offset_y + 17) * ldc] = alpha * rC[3][2]; C[(offset_x + 17) + (offset_y + 17) * ldc] = alpha * rC[3][3]; C[(offset_x + 16) + (offset_y + 32) * ldc] = alpha * rC[4][2]; C[(offset_x + 17) + (offset_y + 32) * ldc] = alpha * rC[4][3]; C[(offset_x + 16) + (offset_y + 33) * ldc] = alpha * rC[5][2]; C[(offset_x + 17) + (offset_y + 33) * ldc] = alpha * rC[5][3]; C[(offset_x + 32) + (offset_y + 0) * ldc] = alpha * rC[0][4]; C[(offset_x + 33) + (offset_y + 0) * ldc] = alpha * rC[0][5]; C[(offset_x + 32) + (offset_y + 1) * ldc] = alpha * rC[1][4]; C[(offset_x + 33) + (offset_y + 1) * ldc] = alpha * rC[1][5]; C[(offset_x + 32) + (offset_y + 16) * ldc] = alpha * rC[2][4]; C[(offset_x + 33) + (offset_y + 16) * ldc] = alpha * rC[2][5]; C[(offset_x + 32) + (offset_y + 17) * ldc] = alpha * rC[3][4]; C[(offset_x + 33) + (offset_y + 17) * ldc] = alpha * rC[3][5]; C[(offset_x + 32) + (offset_y + 32) * ldc] = alpha * rC[4][4]; C[(offset_x + 33) + (offset_y + 32) * ldc] = alpha * rC[4][5]; C[(offset_x + 32) + (offset_y + 33) * ldc] = alpha * rC[5][4]; C[(offset_x + 33) + (offset_y + 33) * ldc] = alpha * rC[5][5]; } "; static const char * 
dgemm_NT_32_32_8_8x8_4x4__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_32_32_8_8x8_4x4__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idyT*33+idxT; plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[16]; PreFetchA[2] = A[4*lda]; PreFetchA[3] = A[16+4*lda]; PreFetchB[0] = B[0]; PreFetchB[1] = B[16]; PreFetchB[2] = B[4*ldb]; PreFetchB[3] = B[16+4*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[16] = PreFetchA[1]; plA[132] = PreFetchA[2]; plA[148] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[16] = PreFetchB[1]; plB[132] = PreFetchB[2]; plB[148] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C+=8; } "; static const char * dgemm_NT_32_32_8_8x8_4x4__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void 
dgemm_NT_32_32_8_8x8_4x4__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idyT*33+idxT; plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[16]; PreFetchA[2] = A[4*lda]; PreFetchA[3] = A[16+4*lda]; PreFetchB[0] = B[0]; PreFetchB[1] = B[16]; PreFetchB[2] = B[4*ldb]; PreFetchB[3] = B[16+4*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[16] = PreFetchA[1]; plA[132] = PreFetchA[2]; plA[148] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[16] = PreFetchB[1]; plB[132] = PreFetchB[2]; plB[148] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; ; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; ; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C+=8; ; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C+=8; } "; static const char * dgemm_NT_40_40_8_8x8_5x5__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_40_40_8_8x8_5x5__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[5][5] = {(double)0}; double rA[1][5]; double rB[1][5]; double 
PreFetchA[5]; double PreFetchB[5]; //double PreFetchA_5; //double PreFetchB_5; A += offsetA; B += offsetB; C+=offsetC; __local double lA[320]; __local double lB[320]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*40+ idxT + idyT*lda; B += gidy*40+ idxT + idyT*ldb; __local double* plA = lA + idyT*40+idxT; __local double* plB = lB + idyT*40+idxT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; plB[24] = B[24]; plB[32] = B[32]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { /* barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; plB[24] = B[24]; plB[32] = B[32]; barrier(CLK_LOCAL_MEM_FENCE); */ A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchA[4] = A[32]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; PreFetchB[2] = B[16]; PreFetchB[3] = B[24]; PreFetchB[4] = B[32]; int offA = idx; int offB = idy; // int off256 = 256; #pragma unroll 1 for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; offA += 40; offB += 40; //off256 -= 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plA[32] = PreFetchA[4]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; plB[32] = PreFetchB[4]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*40; C+= idx; C+= gidy*40*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + 
beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; } "; static const char * dgemm_NT_40_40_8_8x8_5x5__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_40_40_8_8x8_5x5__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[5][5] = {(double)0}; double rA[1][5]; double rB[1][5]; double PreFetchA[5]; double PreFetchB[5]; //double PreFetchA_5; //double PreFetchB_5; A += offsetA; B += offsetB; C+=offsetC; __local double lA[320]; __local double lB[320]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*40+ idxT + idyT*lda; B += gidy*40+ idxT + idyT*ldb; __local double* plA = lA + idyT*40+idxT; __local double* plB = lB + idyT*40+idxT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; plB[24] = B[24]; plB[32] = B[32]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { /* barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8]; plB[16] = B[16]; plB[24] = B[24]; plB[32] = B[32]; barrier(CLK_LOCAL_MEM_FENCE); */ A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchA[4] = A[32]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8]; PreFetchB[2] = B[16]; PreFetchB[3] = B[24]; PreFetchB[4] = B[32]; int offA = idx; int offB = idy; // int off256 = 256; #pragma unroll 1 for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; offA += 40; offB += 40; //off256 -= 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); 
rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plA[32] = PreFetchA[4]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; plB[32] = PreFetchB[4]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8*ldb; } while (--block_k > 0); C+= gidx*40; C+= idx; C+= gidy*40*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = alpha*rC[2][4]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; } "; // ============= genericDgemm.cl // was DgemmGenericMNK // M, N, K /// local size 8,8 //padding 32 static const char * dgemm_NT_1_1_1_8x8_4x4__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_1_1_8x8_4x4__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32+ idxT; int CurrentOffSetB = gidy*32+ idxT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[132] = CurrentOffSetA>=M?0.0:A[4*lda]; plA[148] = CurrentOffSetA+16>=M?0.0:A[16+4*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; plB[132] = CurrentOffSetB>=N?0.0:B[4*ldb]; plB[148] = CurrentOffSetB+16>=N?0.0:B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); 
rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0] + beta*C[0*ldc]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; } } "; static const char * dgemm_NT_1_1_1_8x8_4x4__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_1_1_8x8_4x4__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double*
plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32+ idxT; int CurrentOffSetB = gidy*32+ idxT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[132] = CurrentOffSetA>=M?0.0:A[4*lda]; plA[148] = CurrentOffSetA+16>=M?0.0:A[16+4*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; plB[132] = CurrentOffSetB>=N?0.0:B[4*ldb]; plB[148] = CurrentOffSetB+16>=N?0.0:B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0]; C[8*ldc] = alpha*rC[0][1]; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[8*ldc] = alpha*rC[1][1]; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; C[0*ldc] = alpha*rC[2][0]; C[8*ldc] = alpha*rC[2][1]; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] =
alpha*rC[2][3]; C+=8; C[0*ldc] = alpha*rC[3][0]; C[8*ldc] = alpha*rC[3][1]; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; } } "; // was DgemmGenericMN // M, N, K%8 static const char * dgemm_NT_1_1_8_8x8_4x4__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_1_8_8x8_4x4__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32+ idxT; int CurrentOffSetB = gidy*32+ idxT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[132] = CurrentOffSetA>=M?0.0:A[4*lda]; plA[148] = CurrentOffSetA+16>=M?0.0:A[16+4*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; plB[132] = CurrentOffSetB>=N?0.0:B[4*ldb]; plB[148] = CurrentOffSetB+16>=N?0.0:B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) { return; } for (int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0] + beta*C[0*ldc]; if (OffSetCN+8<N) { C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; } if (OffSetCN+16<N) { C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; } if (OffSetCN+24<N) { C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; } C+=8; OffSetCM += 8; if(OffSetCM>=M) { return; } } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]);
rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; } } "; static const char * dgemm_NT_1_1_8_8x8_4x4__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_1_8_8x8_4x4__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32+ idxT; int CurrentOffSetB = gidy*32+ idxT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[132] = CurrentOffSetA>=M?0.0:A[4*lda]; plA[148] = CurrentOffSetA+16>=M?0.0:A[16+4*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16]; plB[132] = CurrentOffSetB>=N?0.0:B[4*ldb]; plB[148] = CurrentOffSetB+16>=N?0.0:B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); 
rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) { return; } for (int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0]; if (OffSetCN+8<N) { C[8*ldc] = alpha*rC[i][1]; } if (OffSetCN+16<N) { C[16*ldc] = alpha*rC[i][2]; } if (OffSetCN+24<N) { C[24*ldc] = alpha*rC[i][3]; } C+=8; OffSetCM += 8; if(OffSetCM>=M) { return; } } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8*ldb; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; } } "; // // was DgemmGenericK // static const char * dgemm_NT_32_32_1_8x8_4x4__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_32_32_1_8x8_4x4__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { //
barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[16]; PreFetchA[2] = A[4*lda]; PreFetchA[3] = A[16+4*lda]; PreFetchB[0] = B[0]; PreFetchB[1] = B[16]; PreFetchB[2] = B[4*ldb]; PreFetchB[3] = B[16+4*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[16] = PreFetchA[1]; plA[132] = PreFetchA[2]; plA[148] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[16] = PreFetchB[1]; plB[132] = PreFetchB[2]; plB[148] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C+=8; } "; static const char * dgemm_NT_32_32_1_8x8_4x4__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_32_32_1_8x8_4x4__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32+ idxT + idyT*lda; B += gidy*32+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idyT*33+1*idxT; plA[0] = A[0]; plA[16] = A[16]; plA[132] = A[4*lda]; plA[148] = A[16+4*lda]; plB[0] = B[0]; plB[16] = B[16]; plB[132] = B[4*ldb]; plB[148] = B[16+4*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { // barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8*ldb; PreFetchA[0] = A[0]; PreFetchA[1] = A[16]; PreFetchA[2] = A[4*lda]; PreFetchA[3] = 
A[16+4*lda]; PreFetchB[0] = B[0]; PreFetchB[1] = B[16]; PreFetchB[2] = B[4*ldb]; PreFetchB[3] = B[16+4*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[16] = PreFetchA[1]; plA[132] = PreFetchA[2]; plA[148] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[16] = PreFetchB[1]; plB[132] = PreFetchB[2]; plB[148] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C+=8; } "; // ============ TNDgemmColumn.cl static const char * dgemm_TN_32_32_16_8x16_4x2__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_32_32_16_8x16_4x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB , uint const offsetC ) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; for( int block_k=0 ; block_k< K ; block_k+=16) { __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[0+8*lda]; plA[16] = A[0+16*lda]; plA[24] = A[0+24*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); 
rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; } "; static const char * dgemm_TN_32_32_16_8x16_4x2__ALPHA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_32_32_16_8x16_4x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB , uint const offsetC ) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; for( int block_k=0 ; block_k< K ; block_k+=16) { __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[0+8*lda]; plA[16] = A[0+16*lda]; plA[24] = A[0+24*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[16*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[16*ldc] = alpha*rC[1][1]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[16*ldc] = alpha*rC[2][1]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[16*ldc] = alpha*rC[3][1]; } "; static const char * dgemm_TN_48_48_8_8x8_6x6__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_TN_48_48_8_8x8_6x6__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC ) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 4; int idyT = idt / 4; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; 
block_k+=8) uint block_k = K >> 3; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[196] = A[4]; plA[16] = A[16*lda]; plA[212] = A[4+16*lda]; plA[32] = A[32*lda]; plA[228] = A[4+32*lda]; plB[0] = B[0]; plB[196] = B[4+0*ldb]; plB[16] = B[0+16*ldb]; plB[212] = B[4+16*ldb]; plB[32] = B[0+32*ldb]; plB[228] = B[4+32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = 
alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; C+=8; } "; static const char * dgemm_TN_48_48_8_8x8_6x6__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_TN_48_48_8_8x8_6x6__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC ) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 4; int idyT = idt / 4; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[196] = A[4]; plA[16] = A[16*lda]; plA[212] = A[4+16*lda]; plA[32] = A[32*lda]; plA[228] = A[4+32*lda]; plB[0] = B[0]; plB[196] = B[4+0*ldb]; plB[16] = B[0+16*ldb]; plB[212] = B[4+16*ldb]; plB[32] = B[0+32*ldb]; plB[228] = B[4+32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); 
rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C[40*ldc] = alpha*rC[0][5]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C[40*ldc] = alpha*rC[1][5]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = alpha*rC[2][4]; C[40*ldc] = alpha*rC[2][5]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C[40*ldc] = alpha*rC[3][5]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; C[40*ldc] = alpha*rC[4][5]; C+=8; C[0*ldc] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2]; C[24*ldc] = alpha*rC[5][3]; C[32*ldc] = alpha*rC[5][4]; C[40*ldc] = alpha*rC[5][5]; } "; static const char * dgemm_TN_48_48_16_8x8_6x6__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_TN_48_48_16_8x8_6x6__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[784]; __local double lB[784]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; block_k+=16) uint block_k = K >> 4; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[392] = A[8+0*lda]; plA[8] = A[0+8*lda]; plA[400] = A[8+8*lda]; plA[16] = A[0+16*lda]; plA[408] = A[8+16*lda]; plA[24] = A[0+24*lda]; plA[416] = A[8+24*lda]; plA[32] = A[0+32*lda]; plA[424] = A[8+32*lda]; plA[40] = A[0+40*lda]; plA[432] = A[8+40*lda]; plB[0] = B[0+0*ldb]; plB[392] = B[8+0*ldb]; plB[8] = B[0+8*ldb]; plB[400] = B[8+8*ldb]; plB[16] = B[0+16*ldb]; plB[408] = B[8+16*ldb]; plB[24] = B[0+24*ldb]; plB[416] = B[8+24*ldb]; plB[32] = B[0+32*ldb]; plB[424] = B[8+32*ldb]; plB[40] = B[0+40*ldb]; plB[432] = B[8+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); 
rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 16; B += 16; } while (--block_k > 0); C+= gidx*48; C+= idx*1; C+= gidy*48*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0] = alpha*rC[5][0] + beta*C[0]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; C+=8; } "; static const char * dgemm_TN_48_48_16_8x8_6x6__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_TN_48_48_16_8x8_6x6__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double 
lA[784]; __local double lB[784]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48*lda + idxT + idyT*lda; B += gidy*48*ldb+ idxT + idyT*ldb; //for( int block_k=0 ; block_k< K ; block_k+=16) uint block_k = K >> 4; do { __local double* plA = lA + idxT*49+ idyT; __local double* plB = lB + idxT*49+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[392] = A[8+0*lda]; plA[8] = A[0+8*lda]; plA[400] = A[8+8*lda]; plA[16] = A[0+16*lda]; plA[408] = A[8+16*lda]; plA[24] = A[0+24*lda]; plA[416] = A[8+24*lda]; plA[32] = A[0+32*lda]; plA[424] = A[8+32*lda]; plA[40] = A[0+40*lda]; plA[432] = A[8+40*lda]; plB[0] = B[0+0*ldb]; plB[392] = B[8+0*ldb]; plB[8] = B[0+8*ldb]; plB[400] = B[8+8*ldb]; plB[16] = B[0+16*ldb]; plB[408] = B[8+16*ldb]; plB[24] = B[0+24*ldb]; plB[416] = B[8+24*ldb]; plB[32] = B[0+32*ldb]; plB[424] = B[8+32*ldb]; plB[40] = B[0+40*ldb]; plB[432] = B[8+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 16; B += 16; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C[40*ldc] = alpha*rC[0][5]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C[40*ldc] = alpha*rC[1][5]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = alpha*rC[2][4]; 
C[40*ldc] = alpha*rC[2][5]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C[40*ldc] = alpha*rC[3][5]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; C[40*ldc] = alpha*rC[4][5]; C+=8; C[0] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2]; C[24*ldc] = alpha*rC[5][3]; C[32*ldc] = alpha*rC[5][4]; C[40*ldc] = alpha*rC[5][5]; } "; static const char * dgemm_TN_1_1_1_8x16_4x2__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_1_1_1_8x16_4x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = idyT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8*lda]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16*lda]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for(int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0] + beta*C[0]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][1] + beta*C[16*ldc]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8*lda]; plA[16] = A[16*lda]; plA[24] = A[24*lda]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); 
rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; } } "; static const char * dgemm_TN_1_1_1_8x16_4x2__ALPHA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_1_1_1_8x16_4x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = idyT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8*lda]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16*lda]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for(int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][1]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8*lda]; plA[16] = A[16*lda]; plA[24] = A[24*lda]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); 
rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0]; C[16*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[16*ldc] = alpha*rC[1][1]; C+=8; C[0*ldc] = alpha*rC[2][0]; C[16*ldc] = alpha*rC[2][1]; C+=8; C[0*ldc] = alpha*rC[3][0]; C[16*ldc] = alpha*rC[3][1]; } } "; static const char * dgemm_TN_1_1_16_8x16_4x2__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_1_1_16_8x16_4x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = idyT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8*lda]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16*lda]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for(int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0] + beta*C[0]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][1] + beta*C[16*ldc]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8*lda]; plA[16] = A[16*lda]; plA[24] = A[24*lda]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); 
rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C+=8; } } "; static const char * dgemm_TN_1_1_16_8x16_4x2__ALPHA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_1_1_16_8x16_4x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C += offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = idyT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8*lda]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16*lda]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24*lda]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for(int i = 0; i<4; i++) { C[0*ldc] = alpha*rC[i][0]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][1]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=16) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8*lda]; plA[16] = A[16*lda]; plA[24] = A[24*lda]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 16; k+=1) { rA[0][0] = lA[offA ]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB ]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); 
rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0]; C[16*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[16*ldc] = alpha*rC[1][1]; C+=8; C[0*ldc] = alpha*rC[2][0]; C[16*ldc] = alpha*rC[2][1]; C+=8; C[0*ldc] = alpha*rC[3][0]; C[16*ldc] = alpha*rC[3][1]; } } "; static const char * dgemm_TN_32_32_1_8x16_4x2__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_32_32_1_8x16_4x2__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; for( int block_k=0 ; block_k< K ; block_k+=16) { __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[0+8*lda]; plA[16] = A[0+16*lda]; plA[24] = A[0+24*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[0][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[1][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[2][1] + beta*C[16*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[16*ldc] = alpha*rC[3][1] + beta*C[16*ldc]; C+=8; } "; static const char * dgemm_TN_32_32_1_8x16_4x2__ALPHA = " __attribute__( (reqd_work_group_size(8, 16, 1)) ) __kernel void dgemm_TN_32_32_1_8x16_4x2__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[4][2] = {(double)0}; double rA[1][4]; double rB[1][2]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[528]; __local double lB[528]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + 
idx; int idxT = idt % 16; int idyT = idt / 16; A += gidx*32*lda + idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; for( int block_k=0 ; block_k< K ; block_k+=16) { __local double* plA = lA + idxT*33+ idyT; __local double* plB = lB + idxT*33+ idyT; barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[0+8*lda]; plA[16] = A[0+16*lda]; plA[24] = A[0+24*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < min(16u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 16]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); } A += 16; B += 16; } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0]; C[16*ldc] = alpha*rC[0][1]; C+=8; C[0*ldc] = alpha*rC[1][0]; C[16*ldc] = alpha*rC[1][1]; C+=8; C[0*ldc] = alpha*rC[2][0]; C[16*ldc] = alpha*rC[2][1]; C+=8; C[0*ldc] = alpha*rC[3][0]; C[16*ldc] = alpha*rC[3][1]; } "; static const char * dgemm_NN_48_48_8_8x8_6x6__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_48_48_8_8x8_6x6__ALPHABETA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1){ rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); 
rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8*lda; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; } "; static const char * dgemm_NN_48_48_8_8x8_6x6__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_48_48_8_8x8_6x6__ALPHA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[6][6] = {(double)0}; double rA[1][6]; double rB[1][6]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; 
__local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = 1*idx; int offB = 1*idy; for( int k = 0 ; k < 8; k+=1){ rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rA[0][5] = lA[offA + 40]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; rB[0][5] = lB[offB + 40]; offA += 49; offB += 49; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[5][0]=mad(rA[0][5],rB[0][0],rC[5][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[5][1]=mad(rA[0][5],rB[0][1],rC[5][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[5][2]=mad(rA[0][5],rB[0][2],rC[5][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[5][3]=mad(rA[0][5],rB[0][3],rC[5][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); rC[5][4]=mad(rA[0][5],rB[0][4],rC[5][4]); rC[0][5]=mad(rA[0][0],rB[0][5],rC[0][5]); rC[1][5]=mad(rA[0][1],rB[0][5],rC[1][5]); rC[2][5]=mad(rA[0][2],rB[0][5],rC[2][5]); rC[3][5]=mad(rA[0][3],rB[0][5],rC[3][5]); rC[4][5]=mad(rA[0][4],rB[0][5],rC[4][5]); rC[5][5]=mad(rA[0][5],rB[0][5],rC[5][5]); } A += 8*lda; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C[40*ldc] = alpha*rC[0][5]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C[40*ldc] = alpha*rC[1][5]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = alpha*rC[2][4]; C[40*ldc] = alpha*rC[2][5]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C[40*ldc] = alpha*rC[3][5]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; C[40*ldc] = alpha*rC[4][5]; C+=8; C[0*ldc] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2]; C[24*ldc] = alpha*rC[5][3]; C[32*ldc] = alpha*rC[5][4]; C[40*ldc] = alpha*rC[5][5]; } "; static 
const char * dgemm_NN_32_32_8_8x8_4x4__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_32_32_8_8x8_4x4__ALPHABETA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33 + idxT; __local double* plB = lB + idxT*33 + idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { // barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; /* plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb];*/ // barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); /* A += 8*lda; B += 8;*/ } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + 
beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; } "; static const char * dgemm_NN_32_32_8_8x8_4x4__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_32_32_8_8x8_4x4__ALPHA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA[4]; double PreFetchB[4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33 + idxT; __local double* plB = lB + idxT*33 + idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { // barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; /* plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb];*/ // barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); /* A += 8*lda; B += 8;*/ } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; } "; static const char * dgemm_NN_1_1_8_8x8_4x4__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void 
dgemm_NN_1_1_8_8x8_4x4__ALPHABETA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32 + idxT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0: B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0] + beta*C[0]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); 
rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0] + beta*C[0]; C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; C+=8; } } } "; static const char * dgemm_NN_1_1_8_8x8_4x4__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_1_1_8_8x8_4x4__ALPHA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32 + idxT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0: B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k 
= 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0]; C[8*ldc] = alpha*rC[i][1]; C[16*ldc] = alpha*rC[i][2] ; C[24*ldc] = alpha*rC[i][3]; C+=8; } } } "; static const char * dgemm_NN_1_1_1_8x8_4x4__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_1_1_1_8x8_4x4__ALPHABETA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32 + idxT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0: B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); 
rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0] + beta*C[0]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0] + beta*C[0]; C[8*ldc] = alpha*rC[i][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[i][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[i][3] + beta*C[24*ldc]; C+=8; } } } "; static const char * dgemm_NN_1_1_1_8x8_4x4__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_1_1_1_8x8_4x4__ALPHA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+idxT; __local double* plB = lB + idxT*33+ idyT; if(gidx==get_num_groups(0)-1 || gidy==get_num_groups(1)-1 ) { int CurrentOffSetA = gidx*32 + idxT; int CurrentOffSetB = idyT; for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0: B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; 
rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; int OffSetCM = gidx*32+idx; int OffSetCN = gidy*32+idy; if(OffSetCM>=M || OffSetCN>=N) return; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0]; if (OffSetCN+8<N) C[8*ldc] = alpha*rC[i][1]; if (OffSetCN+16<N) C[16*ldc] = alpha*rC[i][2]; if (OffSetCN+24<N) C[24*ldc] = alpha*rC[i][3]; C+=8; OffSetCM += 8; if(OffSetCM>=M) return; } } else { for( int block_k=0 ; block_k< K ; block_k+=8) { barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } A += 8*lda; B += 8; } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; for (int i = 0; i<4; i++) { C[0] = alpha*rC[i][0]; C[8*ldc] = alpha*rC[i][1]; C[16*ldc] = alpha*rC[i][2]; C[24*ldc] = alpha*rC[i][3]; C+=8; } } } "; static const char * dgemm_NN_32_32_1_8x8_4x4__ALPHABETA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_32_32_1_8x8_4x4__ALPHABETA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA [4]; double PreFetchB [4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idxT*33+ idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plB[0] = B[0]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; 
barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); } C+= gidx*32; C+= idx*1; C+= gidy*32*ldc; C+= idy*1*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C+=8; } "; static const char * dgemm_NN_32_32_1_8x8_4x4__ALPHA = " __attribute__( (reqd_work_group_size(8, 8, 1)) ) __kernel void dgemm_NN_32_32_1_8x8_4x4__ALPHA(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double rC[4][4] = {(double)0}; double rA[1][4]; double rB[1][4]; double PreFetchA [4]; double PreFetchB [4]; A += offsetA; B += offsetB; C+=offsetC; __local double lA[264]; __local double lB[264]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*32+ idxT + idyT*lda; B += gidy*32*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*33+1*idxT; __local double* plB = lB + idxT*33+ idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plB[0] = B[0]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for( int block_k=0 ; block_k< K ; block_k+=8) { //barrier(CLK_LOCAL_MEM_FENCE); A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; 
PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; int offA = idx; int offB = idy; for( int k = 0 ; k < min(8u, K-block_k); k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; offA += 33; offB += 33; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; barrier(CLK_LOCAL_MEM_FENCE); } C+= gidx*32; C+= idx; C+= gidy*32*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; } "; static const char * dgemm_NN_40_40_8_8x8_5x5__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_40_40_8_8x8_5x5__ALPHABETA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[5][5] = {(double)0}; double rA[1][5]; double rB[1][5]; double PreFetchA[5]; double PreFetchB[5]; //double PreFetchA_5; //double PreFetchB_5; A += offsetA; B += offsetB; C+=offsetC; __local double lA[320]; __local double lB[320]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*40+ idxT + idyT*lda; B += gidy*40*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*40+idxT; __local double* plB = lB + idxT*40+idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; plB[32] = B[32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { /* barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; plB[32] = B[32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); */ A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchA[4] = A[32]; PreFetchB[0] = 
B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; PreFetchB[4] = B[32*ldb]; int offA = idx; int offB = idy; // int off256 = 256; #pragma unroll 1 for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; offA += 40; offB += 40; // off256 -= 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plA[32] = PreFetchA[4]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; plB[32] = PreFetchB[4]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8; } while (--block_k > 0); C+= gidx*40; C+= idx; C+= gidy*40*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; } "; static const char * dgemm_NN_40_40_8_8x8_5x5__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_40_40_8_8x8_5x5__ALPHA( __global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint const offsetA, uint const offsetB, uint const offsetC) { double rC[5][5] = {(double)0}; double rA[1][5]; double rB[1][5]; 
double PreFetchA[5]; double PreFetchB[5]; //double PreFetchA_5; //double PreFetchB_5; A += offsetA; B += offsetB; C+=offsetC; __local double lA[320]; __local double lB[320]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*40+ idxT + idyT*lda; B += gidy*40*ldb+ idxT + idyT*ldb; __local double* plA = lA + idyT*40+idxT; __local double* plB = lB + idxT*40+idyT; plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; plB[32] = B[32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); //for( int block_k=0 ; block_k< K ; block_k+=8) uint block_k = K >> 3; do { /* barrier(CLK_LOCAL_MEM_FENCE); plA[0] = A[0]; plA[8] = A[8]; plA[16] = A[16]; plA[24] = A[24]; plA[32] = A[32]; plB[0] = B[0]; plB[8] = B[8*ldb]; plB[16] = B[16*ldb]; plB[24] = B[24*ldb]; plB[32] = B[32*ldb]; barrier(CLK_LOCAL_MEM_FENCE); */ A += 8*lda; B += 8; PreFetchA[0] = A[0]; PreFetchA[1] = A[8]; PreFetchA[2] = A[16]; PreFetchA[3] = A[24]; PreFetchA[4] = A[32]; PreFetchB[0] = B[0]; PreFetchB[1] = B[8*ldb]; PreFetchB[2] = B[16*ldb]; PreFetchB[3] = B[24*ldb]; PreFetchB[4] = B[32*ldb]; int offA = idx; int offB = idy; // int off256 = 256; #pragma unroll 1 for( int k = 0 ; k < 8; k+=1) { rA[0][0] = lA[offA + 0]; rA[0][1] = lA[offA + 8]; rA[0][2] = lA[offA + 16]; rA[0][3] = lA[offA + 24]; rA[0][4] = lA[offA + 32]; rB[0][0] = lB[offB + 0]; rB[0][1] = lB[offB + 8]; rB[0][2] = lB[offB + 16]; rB[0][3] = lB[offB + 24]; rB[0][4] = lB[offB + 32]; offA += 40; offB += 40; // off256 -= 24; rC[0][0]=mad(rA[0][0],rB[0][0],rC[0][0]); rC[1][0]=mad(rA[0][1],rB[0][0],rC[1][0]); rC[2][0]=mad(rA[0][2],rB[0][0],rC[2][0]); rC[3][0]=mad(rA[0][3],rB[0][0],rC[3][0]); rC[4][0]=mad(rA[0][4],rB[0][0],rC[4][0]); rC[0][1]=mad(rA[0][0],rB[0][1],rC[0][1]); rC[1][1]=mad(rA[0][1],rB[0][1],rC[1][1]); rC[2][1]=mad(rA[0][2],rB[0][1],rC[2][1]); rC[3][1]=mad(rA[0][3],rB[0][1],rC[3][1]); rC[4][1]=mad(rA[0][4],rB[0][1],rC[4][1]); rC[0][2]=mad(rA[0][0],rB[0][2],rC[0][2]); rC[1][2]=mad(rA[0][1],rB[0][2],rC[1][2]); rC[2][2]=mad(rA[0][2],rB[0][2],rC[2][2]); rC[3][2]=mad(rA[0][3],rB[0][2],rC[3][2]); rC[4][2]=mad(rA[0][4],rB[0][2],rC[4][2]); rC[0][3]=mad(rA[0][0],rB[0][3],rC[0][3]); rC[1][3]=mad(rA[0][1],rB[0][3],rC[1][3]); rC[2][3]=mad(rA[0][2],rB[0][3],rC[2][3]); rC[3][3]=mad(rA[0][3],rB[0][3],rC[3][3]); rC[4][3]=mad(rA[0][4],rB[0][3],rC[4][3]); rC[0][4]=mad(rA[0][0],rB[0][4],rC[0][4]); rC[1][4]=mad(rA[0][1],rB[0][4],rC[1][4]); rC[2][4]=mad(rA[0][2],rB[0][4],rC[2][4]); rC[3][4]=mad(rA[0][3],rB[0][4],rC[3][4]); rC[4][4]=mad(rA[0][4],rB[0][4],rC[4][4]); } barrier(CLK_LOCAL_MEM_FENCE); plA[0] = PreFetchA[0]; plA[8] = PreFetchA[1]; plA[16] = PreFetchA[2]; plA[24] = PreFetchA[3]; plA[32] = PreFetchA[4]; plB[0] = PreFetchB[0]; plB[8] = PreFetchB[1]; plB[16] = PreFetchB[2]; plB[24] = PreFetchB[3]; plB[32] = PreFetchB[4]; barrier(CLK_LOCAL_MEM_FENCE); // A += 8*lda; // B += 8; } while (--block_k > 0); C+= gidx*40; C+= idx; C+= gidy*40*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2]; C[24*ldc] = alpha*rC[0][3]; C[32*ldc] = alpha*rC[0][4]; C+=8; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2]; C[24*ldc] = alpha*rC[1][3]; C[32*ldc] = alpha*rC[1][4]; C+=8; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2]; C[24*ldc] = alpha*rC[2][3]; C[32*ldc] = 
alpha*rC[2][4]; C+=8; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2]; C[24*ldc] = alpha*rC[3][3]; C[32*ldc] = alpha*rC[3][4]; C+=8; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2]; C[24*ldc] = alpha*rC[4][3]; C[32*ldc] = alpha*rC[4][4]; } ";clblas-2.10/src/library/blas/gens/clTemplates/dgemm_hawaiiChannelConfilct.cl000066400000000000000000000073471264277366700272260ustar00rootroot00000000000000static const char * dgemm_NT_ChannelConflict = " typedef union GPtr { __global double *d; __global double2 *d2v; __global double4 *d4v; __global double8 *d8v; __global double16 *d16v; } GPtr; __attribute__((reqd_work_group_size(8, 8, 1))) void __kernel dgemmBlockTempLocalPrefetch(__global double2 const * restrict A, __global double2 const * restrict B, __global double2 * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { double2 a0 ; double b0[8], b1[8]; double b0T, b1T; double2 c[8] = {(double2)(0,0)}; int4 coord = 0; /* contains coordB, coordA, k */ lda /= 2; ldb /= 2; int get_group_id_1; int get_global_id_1; A += (int)get_global_id(0); int lidY = get_local_id(1); get_group_id_1 = (get_group_id(0) + get_group_id(1))% get_num_groups(1); get_global_id_1 = get_group_id_1 * get_local_size(1) /*+ get_local_id(1)*/; //kif = (N % 256 != 0); // get_global_id_1 = (kif*(uint)get_global_id(1)) + ((1-kif)*get_global_id_1); B += get_global_id_1 * 4 ; coord.y = 2 * (int)get_global_id(0); coord.x = 8 * (get_global_id_1+lidY); GPtr uB; uB.d2v = B; local double blockB [128]; int lid = get_local_id(0)+8*lidY; blockB[lid] = uB.d[lid]; blockB[lid+64] = uB.d[lid+2*ldb]; barrier(CLK_LOCAL_MEM_FENCE); for (int k1 = 0; k1 < K; k1 += 2) { /* -- Tiles multiplier -- */ ///barrier(CLK_LOCAL_MEM_FENCE); uB.d2v += (ldb << 1); b0T = uB.d[lid]; b1T = uB.d[lid+2*ldb]; a0 = A[0]; for (int i=0; i<8; i++) { b0[i] = blockB[i+8*lidY]; b1[i] = blockB[i+64+8*lidY]; } for (int i=0; i<8;i++) c[i] = mad(a0, b0[i], c[i]); a0 = A[lda]; for (int i=0; i<8;i++) c[i] = mad(a0, b1[i], c[i]); A += (lda << 1); barrier(CLK_LOCAL_MEM_FENCE); blockB[lid] = b0T; blockB[lid+64] = b1T; barrier(CLK_LOCAL_MEM_FENCE); // uB.d2v += (ldb << 1); /* ---------------------- */ } GPtr uC; uC.d = C + (coord.x * ldc + coord.y)/2; __global double2 *pC = uC.d2v; double2 tempC0, tempC1, tempC2, tempC3, tempC4, tempC5, tempC6, tempC7; tempC0 = pC[0]; tempC1 = pC[(ldc >> 1)]; tempC2 = pC[ldc]; tempC3 = pC[mad24(3u, (ldc >> 1), 0u)]; tempC4 = pC[(ldc << 1)]; tempC5 = pC[mad24(5u, (ldc >> 1), 0u)]; tempC6 = pC[mad24(6u, (ldc >> 1), 0u)]; tempC7 = pC[mad24(7u, (ldc >> 1), 0u)]; tempC0 = mad(tempC0, beta, 0); tempC1 = mad(tempC1, beta, 0); tempC2 = mad(tempC2, beta, 0); tempC3 = mad(tempC3, beta, 0); tempC4 = mad(tempC4, beta, 0); tempC5 = mad(tempC5, beta, 0); tempC6 = mad(tempC6, beta, 0); tempC7 = mad(tempC7, beta, 0); tempC0 = mad(c[0], alpha, tempC0); tempC1 = mad(c[1], alpha, tempC1); tempC2 = mad(c[2], alpha, tempC2); tempC3 = mad(c[3], alpha, tempC3); tempC4 = mad(c[4], alpha, tempC4); tempC5 = mad(c[5], alpha, tempC5); tempC6 = mad(c[6], alpha, tempC6); tempC7 = mad(c[7], alpha, tempC7); pC[0] = tempC0; pC[(ldc >> 1)] = tempC1; pC[ldc] = tempC2; pC[mad24(3u, (ldc >> 1), 0u)] = tempC3; pC[(ldc << 1)] = tempC4; pC[mad24(5u, (ldc >> 1), 0u)] = tempC5; pC[mad24(6u, (ldc >> 1), 0u)] = tempC6; pC[mad24(7u, (ldc >> 1), 0u)] = tempC7; } 
";clblas-2.10/src/library/blas/gens/clTemplates/dgemm_hawaiiSplitKernel.cl000066400000000000000000004764531264277366700264400ustar00rootroot00000000000000static const char * dgemm_NT_8_SPLIT__ALPHABETA = " //static const char * dgemm_NT_48_48_8_8x8_6x6__ALPHABETA_SPLIT = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 1]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 17]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 33]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 1]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 17]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 33]; \ offA += 48; \ offB += 48; \ rC[0][0] = mad(rA[0],rB[0],rC[0][0]); \ rC[0][1] = mad(rA[1],rB[0],rC[0][1]); \ rC[0][2] = mad(rA[2],rB[0],rC[0][2]); \ rC[0][3] = mad(rA[3],rB[0],rC[0][3]); \ rC[0][4] = mad(rA[4],rB[0],rC[0][4]); \ rC[0][5] = mad(rA[5],rB[0],rC[0][5]); \ rC[1][0] = mad(rA[0],rB[1],rC[1][0]); \ rC[1][1] = mad(rA[1],rB[1],rC[1][1]); \ rC[1][2] = mad(rA[2],rB[1],rC[1][2]); \ rC[1][3] = mad(rA[3],rB[1],rC[1][3]); \ rC[1][4] = mad(rA[4],rB[1],rC[1][4]); \ rC[1][5] = mad(rA[5],rB[1],rC[1][5]); \ rC[2][0] = mad(rA[0],rB[2],rC[2][0]); \ rC[2][1] = mad(rA[1],rB[2],rC[2][1]); \ rC[2][2] = mad(rA[2],rB[2],rC[2][2]); \ rC[2][3] = mad(rA[3],rB[2],rC[2][3]); \ rC[2][4] = mad(rA[4],rB[2],rC[2][4]); \ rC[2][5] = mad(rA[5],rB[2],rC[2][5]); \ rC[3][0] = mad(rA[0],rB[3],rC[3][0]); \ rC[3][1] = mad(rA[1],rB[3],rC[3][1]); \ rC[3][2] = mad(rA[2],rB[3],rC[3][2]); \ rC[3][3] = mad(rA[3],rB[3],rC[3][3]); \ rC[3][4] = mad(rA[4],rB[3],rC[3][4]); \ rC[3][5] = mad(rA[5],rB[3],rC[3][5]); \ rC[4][0] = mad(rA[0],rB[4],rC[4][0]); \ rC[4][1] = mad(rA[1],rB[4],rC[4][1]); \ rC[4][2] = mad(rA[2],rB[4],rC[4][2]); \ rC[4][3] = mad(rA[3],rB[4],rC[4][3]); \ rC[4][4] = mad(rA[4],rB[4],rC[4][4]); \ rC[4][5] = mad(rA[5],rB[4],rC[4][5]); \ rC[5][0] = mad(rA[0],rB[5],rC[5][0]); \ rC[5][1] = mad(rA[1],rB[5],rC[5][1]); \ rC[5][2] = mad(rA[2],rB[5],rC[5][2]); \ rC[5][3] = mad(rA[3],rB[5],rC[5][3]); \ rC[5][4] = mad(rA[4],rB[5],rC[5][4]); \ rC[5][5] = mad(rA[5],rB[5],rC[5][5]); \ barrier(CLK_LOCAL_MEM_FENCE); __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_8_8x8_6x6__ALPHABETA_SPLIT_MAIN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; (C[(offset_x + 
0) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 0) * ldc], alpha * rC[0][0])); (C[(offset_x + 1) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 0) * ldc], alpha * rC[0][1])); (C[(offset_x + 0) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 1) * ldc], alpha * rC[1][0])); (C[(offset_x + 1) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 1) * ldc], alpha * rC[1][1])); (C[(offset_x + 0) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 16) * ldc], alpha * rC[2][0])); (C[(offset_x + 1) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 16) * ldc], alpha * rC[2][1])); (C[(offset_x + 0) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 17) * ldc], alpha * rC[3][0])); (C[(offset_x + 1) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 17) * ldc], alpha * rC[3][1])); (C[(offset_x + 0) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 32) * ldc], alpha * rC[4][0])); (C[(offset_x + 1) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 32) * ldc], alpha * rC[4][1])); (C[(offset_x + 0) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 33) * ldc], alpha * rC[5][0])); (C[(offset_x + 1) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 33) * ldc], alpha * rC[5][1])); (C[(offset_x + 16) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 0) * ldc], alpha * rC[0][2])); (C[(offset_x + 17) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 0) * ldc], alpha * rC[0][3])); (C[(offset_x + 16) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 1) * ldc], alpha * rC[1][2])); (C[(offset_x + 17) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 1) * ldc], alpha * rC[1][3])); (C[(offset_x + 16) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 16) * ldc], alpha * rC[2][2])); (C[(offset_x + 17) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 16) * ldc], alpha * rC[2][3])); (C[(offset_x + 16) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 17) * ldc], alpha * rC[3][2])); (C[(offset_x + 17) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 17) * ldc], alpha * rC[3][3])); (C[(offset_x + 16) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 32) * ldc], alpha * rC[4][2])); (C[(offset_x + 17) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 32) * ldc], alpha * rC[4][3])); (C[(offset_x + 16) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 33) * ldc], alpha * rC[5][2])); (C[(offset_x + 17) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 33) * ldc], alpha * rC[5][3])); (C[(offset_x + 32) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 0) * ldc], alpha * rC[0][4])); (C[(offset_x + 33) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 0) * ldc], alpha * rC[0][5])); (C[(offset_x + 32) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 1) * ldc], alpha * rC[1][4])); (C[(offset_x + 33) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 1) * ldc], alpha * rC[1][5])); (C[(offset_x + 32) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 16) * ldc], alpha * rC[2][4])); (C[(offset_x + 33) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 16) * ldc], alpha * rC[2][5])); (C[(offset_x + 
32) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 17) * ldc], alpha * rC[3][4])); (C[(offset_x + 33) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 17) * ldc], alpha * rC[3][5])); (C[(offset_x + 32) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 32) * ldc], alpha * rC[4][4])); (C[(offset_x + 33) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 32) * ldc], alpha * rC[4][5])); (C[(offset_x + 32) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 33) * ldc], alpha * rC[5][4])); (C[(offset_x + 33) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 33) * ldc], alpha * rC[5][5])); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_48_8_8x8_6x6__ALPHABETA_SPLIT_ROW(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetA = 2*(gidx*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; // plB[0 ] = uB.d[0 ]; // plB[1 ] = uB.d[1 ]; // plB[16] = uB.d[16 ]; // plB[17] = uB.d[17]; // plB[32] = uB.d[32]; // plB[33] = uB.d[33]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0] , alpha * rC[0][2*i] ); C[ldc] = mad( beta, C[ldc] , alpha * rC[1][2*i] ); C[16*ldc] = mad( beta, C[16*ldc], alpha * rC[2][2*i] ); C[17*ldc] = mad( beta, C[17*ldc], alpha * rC[3][2*i] ); C[32*ldc] = mad( beta, C[32*ldc], alpha * rC[4][2*i] ); C[33*ldc] = mad( beta, C[33*ldc], alpha * rC[5][2*i] ); if(offset_x+1>=M ) return; C[1] = mad( beta, C[1] , alpha * rC[0][2*i+1] ); C[1+ldc] = mad( beta, C[1+ldc] , alpha * rC[1][2*i+1] ); C[1+16*ldc] = mad( beta, C[1+16*ldc], alpha * rC[2][2*i+1] ); C[1+17*ldc] = mad( beta, C[1+17*ldc], alpha * rC[3][2*i+1] ); C[1+32*ldc] = mad( beta, C[1+32*ldc], alpha * rC[4][2*i+1] ); C[1+33*ldc] = mad( beta, C[1+33*ldc], alpha * rC[5][2*i+1] ); C+=16; offset_x+=16; if(offset_x>=M ) return; } } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_1_8_8x8_6x6__ALPHABETA_SPLIT_COLUMN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint 
ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetB = 2*(gidy*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); //plB[0 ] = uB.d2v[0 ]; //plB[8 ] = uB.d2v[8 ]; //plB[16] = uB.d2v[16]; plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if( offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0], alpha * rC[0][2*i] ); C[1] = mad( beta, C[1], alpha * rC[0][2*i+1] ); if(offset_y+1> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); //plB[0 ] = uB.d2v[0 ]; //plB[8 ] = uB.d2v[8 ]; //plB[16] = uB.d2v[16]; //plA[0 ] = uA.d2v[0 ]; //plA[8 ] = uA.d2v[8 ]; //plA[16] = uA.d2v[16]; plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0] , alpha * rC[0][2*i] ); if(offset_y+1=M ) return; C[1] = mad( beta, C[1] , alpha * rC[0][2*i+1] ); if(offset_y+1=M ) return; } } "; static const char * dgemm_NT_8_SPLIT__ALPHA = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 1]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 17]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 33]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 1]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 17]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 33]; \ offA += 48; \ offB += 48; \ rC[0][0] = mad(rA[0],rB[0],rC[0][0]); \ rC[0][1] = mad(rA[1],rB[0],rC[0][1]); \ rC[0][2] = mad(rA[2],rB[0],rC[0][2]); \ rC[0][3] = mad(rA[3],rB[0],rC[0][3]); \ rC[0][4] = mad(rA[4],rB[0],rC[0][4]); \ rC[0][5] = mad(rA[5],rB[0],rC[0][5]); \ 
rC[1][0] = mad(rA[0],rB[1],rC[1][0]); \ rC[1][1] = mad(rA[1],rB[1],rC[1][1]); \ rC[1][2] = mad(rA[2],rB[1],rC[1][2]); \ rC[1][3] = mad(rA[3],rB[1],rC[1][3]); \ rC[1][4] = mad(rA[4],rB[1],rC[1][4]); \ rC[1][5] = mad(rA[5],rB[1],rC[1][5]); \ rC[2][0] = mad(rA[0],rB[2],rC[2][0]); \ rC[2][1] = mad(rA[1],rB[2],rC[2][1]); \ rC[2][2] = mad(rA[2],rB[2],rC[2][2]); \ rC[2][3] = mad(rA[3],rB[2],rC[2][3]); \ rC[2][4] = mad(rA[4],rB[2],rC[2][4]); \ rC[2][5] = mad(rA[5],rB[2],rC[2][5]); \ rC[3][0] = mad(rA[0],rB[3],rC[3][0]); \ rC[3][1] = mad(rA[1],rB[3],rC[3][1]); \ rC[3][2] = mad(rA[2],rB[3],rC[3][2]); \ rC[3][3] = mad(rA[3],rB[3],rC[3][3]); \ rC[3][4] = mad(rA[4],rB[3],rC[3][4]); \ rC[3][5] = mad(rA[5],rB[3],rC[3][5]); \ rC[4][0] = mad(rA[0],rB[4],rC[4][0]); \ rC[4][1] = mad(rA[1],rB[4],rC[4][1]); \ rC[4][2] = mad(rA[2],rB[4],rC[4][2]); \ rC[4][3] = mad(rA[3],rB[4],rC[4][3]); \ rC[4][4] = mad(rA[4],rB[4],rC[4][4]); \ rC[4][5] = mad(rA[5],rB[4],rC[4][5]); \ rC[5][0] = mad(rA[0],rB[5],rC[5][0]); \ rC[5][1] = mad(rA[1],rB[5],rC[5][1]); \ rC[5][2] = mad(rA[2],rB[5],rC[5][2]); \ rC[5][3] = mad(rA[3],rB[5],rC[5][3]); \ rC[5][4] = mad(rA[4],rB[5],rC[5][4]); \ rC[5][5] = mad(rA[5],rB[5],rC[5][5]); \ barrier(CLK_LOCAL_MEM_FENCE); __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_8_8x8_6x6__ALPHA_SPLIT_MAIN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; (C[(offset_x + 0) + (offset_y + 0) * ldc] = alpha * rC[0][0]); (C[(offset_x + 1) + (offset_y + 0) * ldc] = alpha * rC[0][1]); (C[(offset_x + 0) + (offset_y + 1) * ldc] = alpha * rC[1][0]); (C[(offset_x + 1) + (offset_y + 1) * ldc] = alpha * rC[1][1]); (C[(offset_x + 0) + (offset_y + 16) * ldc] = alpha * rC[2][0]); (C[(offset_x + 1) + (offset_y + 16) * ldc] = alpha * rC[2][1]); (C[(offset_x + 0) + (offset_y + 17) * ldc] = alpha * rC[3][0]); (C[(offset_x + 1) + (offset_y + 17) * ldc] = alpha * rC[3][1]); (C[(offset_x + 0) + (offset_y + 32) * ldc] = alpha * rC[4][0]); (C[(offset_x + 1) + (offset_y + 32) * ldc] = alpha * rC[4][1]); (C[(offset_x + 0) + (offset_y + 33) * ldc] = alpha * rC[5][0]); (C[(offset_x + 1) + (offset_y + 33) * ldc] = alpha * rC[5][1]); (C[(offset_x + 16) + (offset_y + 0) * ldc] = alpha * rC[0][2]); (C[(offset_x + 17) + (offset_y + 0) * ldc] = alpha * rC[0][3]); (C[(offset_x + 16) + (offset_y + 1) * ldc] = alpha * rC[1][2]); (C[(offset_x + 17) + (offset_y + 1) * ldc] = 
alpha * rC[1][3]); (C[(offset_x + 16) + (offset_y + 16) * ldc] = alpha * rC[2][2]); (C[(offset_x + 17) + (offset_y + 16) * ldc] = alpha * rC[2][3]); (C[(offset_x + 16) + (offset_y + 17) * ldc] = alpha * rC[3][2]); (C[(offset_x + 17) + (offset_y + 17) * ldc] = alpha * rC[3][3]); (C[(offset_x + 16) + (offset_y + 32) * ldc] = alpha * rC[4][2]); (C[(offset_x + 17) + (offset_y + 32) * ldc] = alpha * rC[4][3]); (C[(offset_x + 16) + (offset_y + 33) * ldc] = alpha * rC[5][2]); (C[(offset_x + 17) + (offset_y + 33) * ldc] = alpha * rC[5][3]); (C[(offset_x + 32) + (offset_y + 0) * ldc] = alpha * rC[0][4]); (C[(offset_x + 33) + (offset_y + 0) * ldc] = alpha * rC[0][5]); (C[(offset_x + 32) + (offset_y + 1) * ldc] = alpha * rC[1][4]); (C[(offset_x + 33) + (offset_y + 1) * ldc] = alpha * rC[1][5]); (C[(offset_x + 32) + (offset_y + 16) * ldc] = alpha * rC[2][4]); (C[(offset_x + 33) + (offset_y + 16) * ldc] = alpha * rC[2][5]); (C[(offset_x + 32) + (offset_y + 17) * ldc] = alpha * rC[3][4]); (C[(offset_x + 33) + (offset_y + 17) * ldc] = alpha * rC[3][5]); (C[(offset_x + 32) + (offset_y + 32) * ldc] = alpha * rC[4][4]); (C[(offset_x + 33) + (offset_y + 32) * ldc] = alpha * rC[4][5]); (C[(offset_x + 32) + (offset_y + 33) * ldc] = alpha * rC[5][4]); (C[(offset_x + 33) + (offset_y + 33) * ldc] = alpha * rC[5][5]); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_48_8_8x8_6x6__ALPHA_SPLIT_ROW(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetA = 2*(gidx*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; C[ldc] = alpha * rC[1][2*i] ; C[16*ldc] = alpha * rC[2][2*i] ; C[17*ldc] = alpha * rC[3][2*i] ; C[32*ldc] = alpha * rC[4][2*i] ; C[33*ldc] = alpha * rC[5][2*i] ; if(offset_x+1>=M ) return; C[1] = alpha * rC[0][2*i+1] ; C[1+ldc] = alpha * rC[1][2*i+1] ; C[1+16*ldc] = alpha * rC[2][2*i+1] ; C[1+17*ldc] = alpha * rC[3][2*i+1] ; C[1+32*ldc] = alpha * rC[4][2*i+1] ; C[1+33*ldc] = alpha * rC[5][2*i+1] ; C+=16; offset_x+=16; if(offset_x>=M ) return; } } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_1_8_8x8_6x6__ALPHA_SPLIT_COLUMN(__global double2 const * restrict 
A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetB = 2*(gidy*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = K >> 3; do { __local double2* plA = (__local double2*) (lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if( offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; C[1] = alpha * rC[0][2*i+1] ; if(offset_y+1> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); //plB[0 ] = uB.d2v[0 ]; //plB[8 ] = uB.d2v[8 ]; //plB[16] = uB.d2v[16]; //plA[0 ] = uA.d2v[0 ]; //plA[8 ] = uA.d2v[8 ]; //plA[16] = uA.d2v[16]; plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 uA.d += lda << 3; uB.d += ldb << 3; } while (--block_k > 0); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; if(offset_y+1=M ) return; C[1] = alpha * rC[0][2*i+1] ; if(offset_y+1=M ) return; } } "; static const char * dgemm_NT_1_SPLIT__ALPHABETA = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_1_8x8_6x6__ALPHABETA_SPLIT_MAIN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double 
lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ) ; k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; (C[(offset_x + 0) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 0) * ldc], alpha * rC[0][0])); (C[(offset_x + 1) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 0) * ldc], alpha * rC[0][1])); (C[(offset_x + 0) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 1) * ldc], alpha * rC[1][0])); (C[(offset_x + 1) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 1) * ldc], alpha * rC[1][1])); (C[(offset_x + 0) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 16) * ldc], alpha * rC[2][0])); (C[(offset_x + 1) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 16) * ldc], alpha * rC[2][1])); (C[(offset_x + 0) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 17) * ldc], alpha * rC[3][0])); (C[(offset_x + 1) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 17) * ldc], alpha * rC[3][1])); (C[(offset_x + 0) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 32) * ldc], alpha * 
rC[4][0])); (C[(offset_x + 1) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 32) * ldc], alpha * rC[4][1])); (C[(offset_x + 0) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 0) + (offset_y + 33) * ldc], alpha * rC[5][0])); (C[(offset_x + 1) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 1) + (offset_y + 33) * ldc], alpha * rC[5][1])); (C[(offset_x + 16) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 0) * ldc], alpha * rC[0][2])); (C[(offset_x + 17) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 0) * ldc], alpha * rC[0][3])); (C[(offset_x + 16) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 1) * ldc], alpha * rC[1][2])); (C[(offset_x + 17) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 1) * ldc], alpha * rC[1][3])); (C[(offset_x + 16) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 16) * ldc], alpha * rC[2][2])); (C[(offset_x + 17) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 16) * ldc], alpha * rC[2][3])); (C[(offset_x + 16) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 17) * ldc], alpha * rC[3][2])); (C[(offset_x + 17) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 17) * ldc], alpha * rC[3][3])); (C[(offset_x + 16) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 32) * ldc], alpha * rC[4][2])); (C[(offset_x + 17) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 32) * ldc], alpha * rC[4][3])); (C[(offset_x + 16) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 16) + (offset_y + 33) * ldc], alpha * rC[5][2])); (C[(offset_x + 17) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 17) + (offset_y + 33) * ldc], alpha * rC[5][3])); (C[(offset_x + 32) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 0) * ldc], alpha * rC[0][4])); (C[(offset_x + 33) + (offset_y + 0) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 0) * ldc], alpha * rC[0][5])); (C[(offset_x + 32) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 1) * ldc], alpha * rC[1][4])); (C[(offset_x + 33) + (offset_y + 1) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 1) * ldc], alpha * rC[1][5])); (C[(offset_x + 32) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 16) * ldc], alpha * rC[2][4])); (C[(offset_x + 33) + (offset_y + 16) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 16) * ldc], alpha * rC[2][5])); (C[(offset_x + 32) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 17) * ldc], alpha * rC[3][4])); (C[(offset_x + 33) + (offset_y + 17) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 17) * ldc], alpha * rC[3][5])); (C[(offset_x + 32) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 32) * ldc], alpha * rC[4][4])); (C[(offset_x + 33) + (offset_y + 32) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 32) * ldc], alpha * rC[4][5])); (C[(offset_x + 32) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 32) + (offset_y + 33) * ldc], alpha * rC[5][4])); (C[(offset_x + 33) + (offset_y + 33) * ldc] = mad(beta, C[(offset_x + 33) + (offset_y + 33) * ldc], alpha * rC[5][5])); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_48_1_8x8_6x6__ALPHABETA_SPLIT_ROW(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint 
offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetA = 2*(gidx*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double* plA =(lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0] , alpha * rC[0][2*i] ); C[ldc] = mad( beta, C[ldc] , alpha * rC[1][2*i] ); C[16*ldc] = mad( beta, C[16*ldc], alpha * rC[2][2*i] ); C[17*ldc] = mad( beta, C[17*ldc], alpha * rC[3][2*i] ); C[32*ldc] = mad( beta, C[32*ldc], alpha * rC[4][2*i] ); C[33*ldc] = mad( beta, C[33*ldc], alpha * rC[5][2*i] ); if(offset_x+1>=M ) return; C[1] = mad( beta, C[1] , alpha * rC[0][2*i+1] ); C[1+ldc] = mad( beta, C[1+ldc] , alpha * rC[1][2*i+1] ); C[1+16*ldc] = mad( beta, 
C[1+16*ldc], alpha * rC[2][2*i+1] ); C[1+17*ldc] = mad( beta, C[1+17*ldc], alpha * rC[3][2*i+1] ); C[1+32*ldc] = mad( beta, C[1+32*ldc], alpha * rC[4][2*i+1] ); C[1+33*ldc] = mad( beta, C[1+33*ldc], alpha * rC[5][2*i+1] ); C+=16; offset_x+=16; if(offset_x>=M ) return; } } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_1_1_8x8_6x6__ALPHABETA_SPLIT_COLUMN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetB = 2*(gidy*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k =N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0], alpha * rC[0][2*i] ); C[1] = mad( beta, C[1], alpha * rC[0][2*i+1] ); if(offset_y+1> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = 
mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = mad( beta, C[0] , alpha * rC[0][2*i] ); if(offset_y+1=M ) return; C[1] = mad( beta, C[1] , alpha * rC[0][2*i+1] ); if(offset_y+1=M ) return; } } "; static const char * dgemm_NT_1_SPLIT__ALPHA = " typedef union GPtr { __global float *f; __global double *d; __global float2 *f2v; __global double2 *d2v; } GPtr; #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 1]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 17]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 33]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 1]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 17]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 33]; \ offA += 48; \ offB += 48; \ rC[0][0] = mad(rA[0],rB[0],rC[0][0]); \ rC[0][1] = mad(rA[1],rB[0],rC[0][1]); \ rC[0][2] = mad(rA[2],rB[0],rC[0][2]); \ rC[0][3] = mad(rA[3],rB[0],rC[0][3]); \ rC[0][4] = mad(rA[4],rB[0],rC[0][4]); \ rC[0][5] = mad(rA[5],rB[0],rC[0][5]); \ rC[1][0] = mad(rA[0],rB[1],rC[1][0]); \ rC[1][1] = mad(rA[1],rB[1],rC[1][1]); \ rC[1][2] = mad(rA[2],rB[1],rC[1][2]); \ rC[1][3] = mad(rA[3],rB[1],rC[1][3]); \ rC[1][4] = mad(rA[4],rB[1],rC[1][4]); \ rC[1][5] = mad(rA[5],rB[1],rC[1][5]); \ rC[2][0] = mad(rA[0],rB[2],rC[2][0]); \ rC[2][1] = mad(rA[1],rB[2],rC[2][1]); \ rC[2][2] = mad(rA[2],rB[2],rC[2][2]); \ rC[2][3] = mad(rA[3],rB[2],rC[2][3]); \ rC[2][4] = mad(rA[4],rB[2],rC[2][4]); \ rC[2][5] = mad(rA[5],rB[2],rC[2][5]); \ rC[3][0] = mad(rA[0],rB[3],rC[3][0]); \ rC[3][1] = mad(rA[1],rB[3],rC[3][1]); \ rC[3][2] = mad(rA[2],rB[3],rC[3][2]); \ rC[3][3] = mad(rA[3],rB[3],rC[3][3]); \ rC[3][4] = mad(rA[4],rB[3],rC[3][4]); \ rC[3][5] = mad(rA[5],rB[3],rC[3][5]); \ rC[4][0] = mad(rA[0],rB[4],rC[4][0]); \ rC[4][1] = mad(rA[1],rB[4],rC[4][1]); \ rC[4][2] = mad(rA[2],rB[4],rC[4][2]); \ rC[4][3] = mad(rA[3],rB[4],rC[4][3]); \ rC[4][4] = mad(rA[4],rB[4],rC[4][4]); \ rC[4][5] = mad(rA[5],rB[4],rC[4][5]); \ rC[5][0] = mad(rA[0],rB[5],rC[5][0]); \ rC[5][1] = mad(rA[1],rB[5],rC[5][1]); \ rC[5][2] = mad(rA[2],rB[5],rC[5][2]); \ rC[5][3] = mad(rA[3],rB[5],rC[5][3]); \ rC[5][4] = mad(rA[4],rB[5],rC[5][4]); \ rC[5][5] = mad(rA[5],rB[5],rC[5][5]); \ barrier(CLK_LOCAL_MEM_FENCE); __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_48_1_8x8_6x6__ALPHA_SPLIT_MAIN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; 
uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; (C[(offset_x + 0) + (offset_y + 0) * ldc] = alpha * rC[0][0]); (C[(offset_x + 1) + (offset_y + 0) * ldc] = alpha * rC[0][1]); (C[(offset_x + 0) + (offset_y + 1) * ldc] = alpha * rC[1][0]); (C[(offset_x + 1) + (offset_y + 1) * ldc] = alpha * rC[1][1]); (C[(offset_x + 0) + (offset_y + 16) * ldc] = alpha * rC[2][0]); (C[(offset_x + 1) + (offset_y + 16) * ldc] = alpha * rC[2][1]); (C[(offset_x + 0) + (offset_y + 17) * ldc] = alpha * rC[3][0]); (C[(offset_x + 1) + (offset_y + 17) * ldc] = alpha * rC[3][1]); (C[(offset_x + 0) + (offset_y + 32) * ldc] = alpha * rC[4][0]); (C[(offset_x + 1) + (offset_y + 32) * ldc] = alpha * rC[4][1]); (C[(offset_x + 0) + (offset_y + 33) * ldc] = alpha * rC[5][0]); (C[(offset_x + 1) + (offset_y + 33) * ldc] = alpha * rC[5][1]); (C[(offset_x + 16) + (offset_y + 0) * ldc] = alpha * rC[0][2]); (C[(offset_x + 
17) + (offset_y + 0) * ldc] = alpha * rC[0][3]); (C[(offset_x + 16) + (offset_y + 1) * ldc] = alpha * rC[1][2]); (C[(offset_x + 17) + (offset_y + 1) * ldc] = alpha * rC[1][3]); (C[(offset_x + 16) + (offset_y + 16) * ldc] = alpha * rC[2][2]); (C[(offset_x + 17) + (offset_y + 16) * ldc] = alpha * rC[2][3]); (C[(offset_x + 16) + (offset_y + 17) * ldc] = alpha * rC[3][2]); (C[(offset_x + 17) + (offset_y + 17) * ldc] = alpha * rC[3][3]); (C[(offset_x + 16) + (offset_y + 32) * ldc] = alpha * rC[4][2]); (C[(offset_x + 17) + (offset_y + 32) * ldc] = alpha * rC[4][3]); (C[(offset_x + 16) + (offset_y + 33) * ldc] = alpha * rC[5][2]); (C[(offset_x + 17) + (offset_y + 33) * ldc] = alpha * rC[5][3]); (C[(offset_x + 32) + (offset_y + 0) * ldc] = alpha * rC[0][4]); (C[(offset_x + 33) + (offset_y + 0) * ldc] = alpha * rC[0][5]); (C[(offset_x + 32) + (offset_y + 1) * ldc] = alpha * rC[1][4]); (C[(offset_x + 33) + (offset_y + 1) * ldc] = alpha * rC[1][5]); (C[(offset_x + 32) + (offset_y + 16) * ldc] = alpha * rC[2][4]); (C[(offset_x + 33) + (offset_y + 16) * ldc] = alpha * rC[2][5]); (C[(offset_x + 32) + (offset_y + 17) * ldc] = alpha * rC[3][4]); (C[(offset_x + 33) + (offset_y + 17) * ldc] = alpha * rC[3][5]); (C[(offset_x + 32) + (offset_y + 32) * ldc] = alpha * rC[4][4]); (C[(offset_x + 33) + (offset_y + 32) * ldc] = alpha * rC[4][5]); (C[(offset_x + 32) + (offset_y + 33) * ldc] = alpha * rC[5][4]); (C[(offset_x + 33) + (offset_y + 33) * ldc] = alpha * rC[5][5]); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_1_48_1_8x8_6x6__ALPHA_SPLIT_ROW(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetA = 2*(gidx*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double2* plB = (__local double2*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = uB.d2v[0 ]; plB[8 ] = uB.d2v[8 ]; plB[16] = uB.d2v[16]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = 
mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; C[ldc] = alpha * rC[1][2*i] ; C[16*ldc] = alpha * rC[2][2*i] ; C[17*ldc] = alpha * rC[3][2*i] ; C[32*ldc] = alpha * rC[4][2*i] ; C[33*ldc] = alpha * rC[5][2*i] ; if(offset_x+1>=M ) return; C[1] = alpha * rC[0][2*i+1] ; C[1+ldc] = alpha * rC[1][2*i+1] ; C[1+16*ldc] = alpha * rC[2][2*i+1] ; C[1+17*ldc] = alpha * rC[3][2*i+1] ; C[1+32*ldc] = alpha * rC[4][2*i+1] ; C[1+33*ldc] = alpha * rC[5][2*i+1] ; C+=16; offset_x+=16; if(offset_x>=M ) return; } } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NT_48_1_1_8x8_6x6__ALPHA_SPLIT_COLUMN(__global double2 const * restrict A, __global double2 const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { GPtr uA, uB; uA.d2v = (__global double2 *)A; uB.d2v = (__global double2 *)B; // C += offsetC; uA.d += offsetA; uB.d += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int CurrentOffSetB = 2*(gidy*24 + idx); uA.d += 2*(gidx*24 + idx) + idy*lda; uB.d += 2*(gidy*24 + idx) + idy*ldb; int block_k = 0;//K >> 3; do { __local double2* plA = (__local double2*)(lA + idy*48 + 2*idx); __local double* plB = (__local double*)(lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0 ] = uA.d2v[0 ]; plA[8 ] = uA.d2v[8 ]; plA[16] = uA.d2v[16]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; 
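/* One k-step of the register micro-kernel: the 6x6 outer product of the just-loaded rA and rB values is accumulated entirely in registers, rC[j][i] += rA[i]*rB[j], via mad(). */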
rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if( offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; C[1] = alpha * rC[0][2*i+1] ; if(offset_y+1> 3; do { __local double* plA = (lA + idy*48 + 2*idx); __local double* plB = (lB + idy*48 + 2*idx); // barrier(CLK_LOCAL_MEM_FENCE); plB[0 ] = CurrentOffSetB>=N?0.0:uB.d[0 ]; plB[1 ] = CurrentOffSetB+1>=N?0.0:uB.d[1 ]; plB[16] = CurrentOffSetB+16>=N?0.0:uB.d[16 ]; plB[17] = CurrentOffSetB+17>=N?0.0:uB.d[17]; plB[32] = CurrentOffSetB+32>=N?0.0:uB.d[32]; plB[33] = CurrentOffSetB+33>=N?0.0:uB.d[33]; plA[0] = CurrentOffSetA>=M?0.0:uA.d[0]; plA[1] = CurrentOffSetA+1>=M?0.0:uA.d[1]; plA[16] = CurrentOffSetA+16>=M?0.0:uA.d[16]; plA[17] = CurrentOffSetA+17>=M?0.0:uA.d[17]; plA[32] = CurrentOffSetA+32>=M?0.0:uA.d[32]; plA[33] = CurrentOffSetA+33>=M?0.0:uA.d[33]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx << 1; int offB = idy << 1; #pragma unroll 1 for(uint k = 0 ; k < min(8u, K-block_k ); k+=1) { rA[0] = lA[offA + 0]; rA[1] = lA[offA + 1]; rA[2] = lA[offA + 16]; rA[3] = lA[offA + 17]; rA[4] = lA[offA + 32]; rA[5] = lA[offA + 33]; rB[0] = lB[offB + 0]; rB[1] = lB[offB + 1]; rB[2] = lB[offB + 16]; rB[3] = lB[offB + 17]; rB[4] = lB[offB + 32]; rB[5] = lB[offB + 33]; offA += 48; offB += 48; rC[0][0] = mad(rA[0],rB[0],rC[0][0]); rC[0][1] = mad(rA[1],rB[0],rC[0][1]); rC[0][2] = mad(rA[2],rB[0],rC[0][2]); rC[0][3] = mad(rA[3],rB[0],rC[0][3]); rC[0][4] = mad(rA[4],rB[0],rC[0][4]); rC[0][5] = mad(rA[5],rB[0],rC[0][5]); rC[1][0] = mad(rA[0],rB[1],rC[1][0]); rC[1][1] = mad(rA[1],rB[1],rC[1][1]); rC[1][2] = mad(rA[2],rB[1],rC[1][2]); rC[1][3] = mad(rA[3],rB[1],rC[1][3]); rC[1][4] = mad(rA[4],rB[1],rC[1][4]); rC[1][5] = mad(rA[5],rB[1],rC[1][5]); rC[2][0] = mad(rA[0],rB[2],rC[2][0]); rC[2][1] = mad(rA[1],rB[2],rC[2][1]); rC[2][2] = mad(rA[2],rB[2],rC[2][2]); rC[2][3] = mad(rA[3],rB[2],rC[2][3]); rC[2][4] = mad(rA[4],rB[2],rC[2][4]); rC[2][5] = mad(rA[5],rB[2],rC[2][5]); rC[3][0] = mad(rA[0],rB[3],rC[3][0]); rC[3][1] = mad(rA[1],rB[3],rC[3][1]); rC[3][2] = 
mad(rA[2],rB[3],rC[3][2]); rC[3][3] = mad(rA[3],rB[3],rC[3][3]); rC[3][4] = mad(rA[4],rB[3],rC[3][4]); rC[3][5] = mad(rA[5],rB[3],rC[3][5]); rC[4][0] = mad(rA[0],rB[4],rC[4][0]); rC[4][1] = mad(rA[1],rB[4],rC[4][1]); rC[4][2] = mad(rA[2],rB[4],rC[4][2]); rC[4][3] = mad(rA[3],rB[4],rC[4][3]); rC[4][4] = mad(rA[4],rB[4],rC[4][4]); rC[4][5] = mad(rA[5],rB[4],rC[4][5]); rC[5][0] = mad(rA[0],rB[5],rC[5][0]); rC[5][1] = mad(rA[1],rB[5],rC[5][1]); rC[5][2] = mad(rA[2],rB[5],rC[5][2]); rC[5][3] = mad(rA[3],rB[5],rC[5][3]); rC[5][4] = mad(rA[4],rB[5],rC[5][4]); rC[5][5] = mad(rA[5],rB[5],rC[5][5]); barrier(CLK_LOCAL_MEM_FENCE); } uA.d += lda << 3; uB.d += ldb << 3; block_k+=8; } while (block_k < K); int offset_x = gidx*48+ idx*2; int offset_y = gidy*48+ idy*2; if(offset_x>=M || offset_y>=N ) return; C+=offset_x+offset_y*ldc; for (int i = 0; i<3; i++) { C[0] = alpha * rC[0][2*i] ; if(offset_y+1=M ) return; C[1] = alpha * rC[0][2*i+1] ; if(offset_y+1=M ) return; } } "; static const char * dgemm_NN_8_SPLIT__ALPHABETA = " #define M6x6 \ rA[0] = lA[offA + 0]; \ rA[1] = lA[offA + 8]; \ rA[2] = lA[offA + 16]; \ rA[3] = lA[offA + 24]; \ rA[4] = lA[offA + 32]; \ rA[5] = lA[offA + 40]; \ rB[0] = lB[offB + 0]; \ rB[1] = lB[offB + 8]; \ rB[2] = lB[offB + 16]; \ rB[3] = lB[offB + 24]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 40]; \ offA += 49; \ offB += 49; \ rC[0][0]=mad(rA[0],rB[0],rC[0][0]); \ rC[1][0]=mad(rA[1],rB[0],rC[1][0]); \ rC[2][0]=mad(rA[2],rB[0],rC[2][0]); \ rC[3][0]=mad(rA[3],rB[0],rC[3][0]); \ rC[4][0]=mad(rA[4],rB[0],rC[4][0]); \ rC[5][0]=mad(rA[5],rB[0],rC[5][0]); \ rC[0][1]=mad(rA[0],rB[1],rC[0][1]); \ rC[1][1]=mad(rA[1],rB[1],rC[1][1]); \ rC[2][1]=mad(rA[2],rB[1],rC[2][1]); \ rC[3][1]=mad(rA[3],rB[1],rC[3][1]); \ rC[4][1]=mad(rA[4],rB[1],rC[4][1]); \ rC[5][1]=mad(rA[5],rB[1],rC[5][1]); \ rC[0][2]=mad(rA[0],rB[2],rC[0][2]); \ rC[1][2]=mad(rA[1],rB[2],rC[1][2]); \ rC[2][2]=mad(rA[2],rB[2],rC[2][2]); \ rC[3][2]=mad(rA[3],rB[2],rC[3][2]); \ rC[4][2]=mad(rA[4],rB[2],rC[4][2]); \ rC[5][2]=mad(rA[5],rB[2],rC[5][2]); \ rC[0][3]=mad(rA[0],rB[3],rC[0][3]); \ rC[1][3]=mad(rA[1],rB[3],rC[1][3]); \ rC[2][3]=mad(rA[2],rB[3],rC[2][3]); \ rC[3][3]=mad(rA[3],rB[3],rC[3][3]); \ rC[4][3]=mad(rA[4],rB[3],rC[4][3]); \ rC[5][3]=mad(rA[5],rB[3],rC[5][3]); \ rC[0][4]=mad(rA[0],rB[4],rC[0][4]); \ rC[1][4]=mad(rA[1],rB[4],rC[1][4]); \ rC[2][4]=mad(rA[2],rB[4],rC[2][4]); \ rC[3][4]=mad(rA[3],rB[4],rC[3][4]); \ rC[4][4]=mad(rA[4],rB[4],rC[4][4]); \ rC[5][4]=mad(rA[5],rB[4],rC[5][4]); \ rC[0][5]=mad(rA[0],rB[5],rC[0][5]); \ rC[1][5]=mad(rA[1],rB[5],rC[1][5]); \ rC[2][5]=mad(rA[2],rB[5],rC[2][5]); \ rC[3][5]=mad(rA[3],rB[5],rC[3][5]); \ rC[4][5]=mad(rA[4],rB[5],rC[4][5]); \ rC[5][5]=mad(rA[5],rB[5],rC[5][5]); \ barrier(CLK_LOCAL_MEM_FENCE); __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_48_8_8x8_6x6__ALPHABETA_SPLIT_MAIN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA 
+ idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx; int offB = idy; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[0][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[0][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[0][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[0][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[0][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[1][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[1][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[1][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[1][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[1][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[1][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[2][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[2][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[2][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[2][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[2][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[2][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[3][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[3][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[3][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[3][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[3][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[3][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[4][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[4][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[4][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[4][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[4][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[4][5] + beta*C[40*ldc]; C+=8; C[0*ldc] = alpha*rC[5][0] + beta*C[0*ldc]; C[8*ldc] = alpha*rC[5][1] + beta*C[8*ldc]; C[16*ldc] = alpha*rC[5][2] + beta*C[16*ldc]; C[24*ldc] = alpha*rC[5][3] + beta*C[24*ldc]; C[32*ldc] = alpha*rC[5][4] + beta*C[32*ldc]; C[40*ldc] = alpha*rC[5][5] + beta*C[40*ldc]; } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_1_48_8_8x8_6x6__ALPHABETA_SPLIT_ROW(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; int CurrentOffSetA = gidx*48 + idxT; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B 
+= 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_x>=M ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] + beta*C[0*ldc+i*8]; C[8*ldc+i*8] = alpha*rC[i][1] + beta*C[8*ldc+i*8]; C[16*ldc+i*8] = alpha*rC[i][2] + beta*C[16*ldc+i*8]; C[24*ldc+i*8] = alpha*rC[i][3] + beta*C[24*ldc+i*8]; C[32*ldc+i*8] = alpha*rC[i][4] + beta*C[32*ldc+i*8]; C[40*ldc+i*8] = alpha*rC[i][5] + beta*C[40*ldc+i*8]; offset_x += 8; if(offset_x>=M) return; }while (++i < 6); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_1_8_8x8_6x6__ALPHABETA_SPLIT_COLUMN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += (gidy*48+idyT)*ldb + idxT; int CurrentOffSetB = gidy*48 + idyT; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_y>=N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[i*8] = alpha*rC[i][0] + beta*C[i*8]; if (offset_y+8> 3; do { plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_x>=M || offset_y>=N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] + beta*C[0*ldc+i*8]; if (offset_y+8=M) return; }while (++i < 6); } "; static const char * dgemm_NN_8_SPLIT__ALPHA = " #define M6x6 \ rA[0] = lA[offA + 0];\ rA[1] = lA[offA + 8];\ rA[2] = lA[offA + 16];\ rA[3] = lA[offA + 24];\ rA[4] = lA[offA + 32];\ rA[5] = lA[offA + 40];\ rB[0] = lB[offB + 0];\ rB[1] = lB[offB + 8];\ rB[2] = lB[offB + 16];\ rB[3] = lB[offB + 24]; \ rB[4] = lB[offB + 32]; \ rB[5] = lB[offB + 40]; \ offA += 49; \ offB += 49; \ 
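/* FMA body of the M6x6 macro: the six A values and six B values fetched above from the padded (stride-49) local tiles feed the 6x6 accumulation rC[i][j] += rA[i]*rB[j]. */ \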
rC[0][0]=mad(rA[0],rB[0],rC[0][0]); \ rC[1][0]=mad(rA[1],rB[0],rC[1][0]); \ rC[2][0]=mad(rA[2],rB[0],rC[2][0]); \ rC[3][0]=mad(rA[3],rB[0],rC[3][0]); \ rC[4][0]=mad(rA[4],rB[0],rC[4][0]); \ rC[5][0]=mad(rA[5],rB[0],rC[5][0]); \ rC[0][1]=mad(rA[0],rB[1],rC[0][1]); \ rC[1][1]=mad(rA[1],rB[1],rC[1][1]); \ rC[2][1]=mad(rA[2],rB[1],rC[2][1]); \ rC[3][1]=mad(rA[3],rB[1],rC[3][1]); \ rC[4][1]=mad(rA[4],rB[1],rC[4][1]); \ rC[5][1]=mad(rA[5],rB[1],rC[5][1]); \ rC[0][2]=mad(rA[0],rB[2],rC[0][2]); \ rC[1][2]=mad(rA[1],rB[2],rC[1][2]); \ rC[2][2]=mad(rA[2],rB[2],rC[2][2]); \ rC[3][2]=mad(rA[3],rB[2],rC[3][2]); \ rC[4][2]=mad(rA[4],rB[2],rC[4][2]); \ rC[5][2]=mad(rA[5],rB[2],rC[5][2]); \ rC[0][3]=mad(rA[0],rB[3],rC[0][3]); \ rC[1][3]=mad(rA[1],rB[3],rC[1][3]); \ rC[2][3]=mad(rA[2],rB[3],rC[2][3]); \ rC[3][3]=mad(rA[3],rB[3],rC[3][3]); \ rC[4][3]=mad(rA[4],rB[3],rC[4][3]); \ rC[5][3]=mad(rA[5],rB[3],rC[5][3]); \ rC[0][4]=mad(rA[0],rB[4],rC[0][4]); \ rC[1][4]=mad(rA[1],rB[4],rC[1][4]); \ rC[2][4]=mad(rA[2],rB[4],rC[2][4]); \ rC[3][4]=mad(rA[3],rB[4],rC[3][4]); \ rC[4][4]=mad(rA[4],rB[4],rC[4][4]); \ rC[5][4]=mad(rA[5],rB[4],rC[5][4]); \ rC[0][5]=mad(rA[0],rB[5],rC[0][5]); \ rC[1][5]=mad(rA[1],rB[5],rC[1][5]); \ rC[2][5]=mad(rA[2],rB[5],rC[2][5]); \ rC[3][5]=mad(rA[3],rB[5],rC[3][5]); \ rC[4][5]=mad(rA[4],rB[5],rC[4][5]); \ rC[5][5]=mad(rA[5],rB[5],rC[5][5]); \ barrier(CLK_LOCAL_MEM_FENCE); __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_48_8_8x8_6x6__ALPHA_SPLIT_MAIN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); C+= gidx*48; C+= idx; C+= gidy*48*ldc; C+= idy*ldc; C[0*ldc] = alpha*rC[0][0] ; C[8*ldc] = alpha*rC[0][1] ; C[16*ldc] = alpha*rC[0][2] ; C[24*ldc] = alpha*rC[0][3] ; C[32*ldc] = alpha*rC[0][4] ; C[40*ldc] = alpha*rC[0][5] ; C+=8; ; C[0*ldc] = alpha*rC[1][0] ; C[8*ldc] = alpha*rC[1][1] ; C[16*ldc] = alpha*rC[1][2] ; C[24*ldc] = alpha*rC[1][3] ; C[32*ldc] = alpha*rC[1][4] ; C[40*ldc] = alpha*rC[1][5] ; C+=8; ; C[0*ldc] = alpha*rC[2][0] ; C[8*ldc] = alpha*rC[2][1] ; C[16*ldc] = alpha*rC[2][2] ; C[24*ldc] = alpha*rC[2][3] ; C[32*ldc] = alpha*rC[2][4] ; C[40*ldc] = alpha*rC[2][5] ; C+=8; ; C[0*ldc] = alpha*rC[3][0] ; C[8*ldc] = alpha*rC[3][1] ; C[16*ldc] = alpha*rC[3][2] ; C[24*ldc] = alpha*rC[3][3] ; C[32*ldc] = alpha*rC[3][4] ; C[40*ldc] = alpha*rC[3][5] ; C+=8; ; C[0*ldc] = alpha*rC[4][0] ; C[8*ldc] = alpha*rC[4][1] ; C[16*ldc] = alpha*rC[4][2] ; C[24*ldc] = alpha*rC[4][3] ; C[32*ldc] = 
alpha*rC[4][4] ; C[40*ldc] = alpha*rC[4][5] ; C+=8; ; C[0*ldc] = alpha*rC[5][0] ; C[8*ldc] = alpha*rC[5][1] ; C[16*ldc] = alpha*rC[5][2] ; C[24*ldc] = alpha*rC[5][3] ; C[32*ldc] = alpha*rC[5][4] ; C[40*ldc] = alpha*rC[5][5] ; } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_1_48_8_8x8_6x6__ALPHA_SPLIT_ROW(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = M/48;//get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; int CurrentOffSetA = gidx*48 + idxT; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_x>=M ) return; C+= offset_x+ldc*offset_y; for (int i=0; i<6; i++) { C[0*ldc+i*8] = alpha*rC[i][0] ; C[8*ldc+i*8] = alpha*rC[i][1] ; C[16*ldc+i*8] = alpha*rC[i][2] ; C[24*ldc+i*8] = alpha*rC[i][3] ; C[32*ldc+i*8] = alpha*rC[i][4] ; C[40*ldc+i*8] = alpha*rC[i][5] ; offset_x += 8; if(offset_x>=M) return; } } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_1_8_8x8_6x6__ALPHA_SPLIT_COLUMN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; int CurrentOffSetB = gidy*48 + idyT; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_y>=N ) return; C+= offset_x+ldc*offset_y; 
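/* Column-tail write-back: each work-item stores its 6x6 results in 8-row strides, and stores into the further 8-column blocks are guarded against offset_y running past N, so a partially covered 48-wide tile never writes outside C. */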
//for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] ; if (offset_y+8> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 M6x6 A += lda << 3; B += 8; } while (--block_k > 0); int offset_x = gidx*48+ idx; int offset_y = gidy*48+ idy; if(offset_x>=M || offset_y>=N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] ; if (offset_y+8=M) return; }while (++i < 6); } "; static const char * dgemm_NN_1_SPLIT__ALPHABETA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_48_1_8x8_6x6__ALPHABETA_SPLIT_MAIN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = 0;//K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k > 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =M ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] + beta*C[0*ldc+i*8]; C[8*ldc+i*8] = alpha*rC[i][1] + beta*C[8*ldc+i*8]; C[16*ldc+i*8] = alpha*rC[i][2] + beta*C[16*ldc+i*8]; C[24*ldc+i*8] = alpha*rC[i][3] + beta*C[24*ldc+i*8]; C[32*ldc+i*8] = alpha*rC[i][4] + beta*C[32*ldc+i*8]; C[40*ldc+i*8] = alpha*rC[i][5] + beta*C[40*ldc+i*8]; offset_x += 8; if(offset_x>=M) return; }while (++i < 6); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_1_1_8x8_6x6__ALPHABETA_SPLIT_COLUMN(__global double const * restrict A, 
__global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, double const beta, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int CurrentOffSetB = gidy*48 + idyT; int block_k = 0;//K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] + beta*C[0*ldc+i*8]; if (offset_y+8> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =M || offset_y>=N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] + beta*C[0*ldc+i*8]; if (offset_y+8=M) return; }while (++i < 6); } "; static const char * dgemm_NN_1_SPLIT__ALPHA = " __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_48_1_8x8_6x6__ALPHA_SPLIT_MAIN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = 0;//K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 
1 for(uint k = 0 ; k > 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = B[0+0*ldb]; plB[8] = B[0+8*ldb]; plB[16] = B[0+16*ldb]; plB[24] = B[0+24*ldb]; plB[32] = B[0+32*ldb]; plB[40] = B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =M ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] ; C[8*ldc+i*8] = alpha*rC[i][1] ; C[16*ldc+i*8] = alpha*rC[i][2] ; C[24*ldc+i*8] = alpha*rC[i][3]; C[32*ldc+i*8] = alpha*rC[i][4]; C[40*ldc+i*8] = alpha*rC[i][5]; offset_x += 8; if(offset_x>=M) return; }while (++i < 6); } __attribute__((reqd_work_group_size(8,8,1))) __kernel void dgemm_NN_48_1_1_8x8_6x6__ALPHA_SPLIT_COLUMN(__global double const * restrict A, __global double const * restrict B, __global double * C, uint const M, uint const N, uint const K, double const alpha, uint lda, uint ldb, uint ldc, uint offsetA, uint offsetB, uint offsetC) { A += offsetA; B += offsetB; C += offsetC; double rC[6][6] = {(double)0}; double rA[6]; double rB[6]; __local double lA[392]; __local double lB[392]; int gidx = get_group_id(0); int gidy = N/48;//get_group_id(1); int idx = get_local_id(0); int idy = get_local_id(1); int idt = 8*idy + idx; int idxT = idt % 8; int idyT = idt / 8; int CurrentOffSetB = gidy*48 + idyT; A += gidx*48+ idxT + idyT*lda; B += gidy*48*ldb+ idx + idy*ldb; int block_k = 0;//K >> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = A[0+0*lda]; plA[8] = A[8+0*lda]; plA[16] = A[16+0*lda]; plA[24] = A[24+0*lda]; plA[32] = A[32+0*lda]; plA[40] = A[40+0*lda]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] ; if (offset_y+8> 3; do { __local double* plA = lA + idyT*49 + idxT; __local double* plB = lB + idxT*49 + idyT; plB[0] = CurrentOffSetB>=N?0.0:B[0]; plB[8] = CurrentOffSetB+8>=N?0.0:B[0+8*ldb]; plB[16] = CurrentOffSetB+16>=N?0.0:B[0+16*ldb]; plB[24] = CurrentOffSetB+24>=N?0.0:B[0+24*ldb]; plB[32] = CurrentOffSetB+32>=N?0.0:B[0+32*ldb]; plB[40] = CurrentOffSetB+40>=N?0.0:B[0+40*ldb]; plA[0] = CurrentOffSetA>=M?0.0:A[0]; plA[8] = CurrentOffSetA+8>=M?0.0:A[8]; plA[16] = CurrentOffSetA+16>=M?0.0:A[16]; plA[24] = CurrentOffSetA+24>=M?0.0:A[24]; plA[32] = CurrentOffSetA+32>=M?0.0:A[32]; plA[40] = CurrentOffSetA+40>=M?0.0:A[40]; barrier(CLK_LOCAL_MEM_FENCE); int offA = idx ; int offB = idy ; #pragma unroll 1 for(uint k = 0 ; k =M ||offset_y>=N ) return; C+= offset_x+ldc*offset_y; //for (int i=0; i<6; i++) int i = 0; do { C[0*ldc+i*8] = alpha*rC[i][0] ; if (offset_y+8=M) return; }while (++i < 6); } ";clblas-2.10/src/library/blas/gens/clTemplates/dot.cl000066400000000000000000000053011264277366700224120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 
2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ static const char *dot_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif __kernel void %PREFIXdot_kernel( __global %TYPE *_X, __global %TYPE *_Y, __global %TYPE *scratchBuff, uint N, uint offx, int incx, uint offy, int incy, int doConj ) { __global %TYPE *X = _X + offx; __global %TYPE *Y = _Y + offy; %TYPE dotP = (%TYPE) 0.0; if ( incx < 0 ) { X = X + (N - 1) * abs(incx); } if ( incy < 0 ) { Y = Y + (N - 1) * abs(incy); } int gOffset; for( gOffset=(get_global_id(0) * %V); (gOffset + %V - 1) 4. Successive threads are now ITEMX apart // Instead, we can make them float4 apart to get highest L1 cache bandwidth // 5. A.B^T - actualCol, actualRow optimization // static const char *GEMM_NN_KERNEL = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif __kernel void GEMM_NN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C, uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc, %TYPE alpha, %TYPE beta #ifdef TAIL_RUN , uint tailStartM, uint tailStartN #endif ) { const int V = %V; __global %TYPE const *restrict A; __global %TYPE const *restrict B; __global %TYPE *C = _C + offc; uint K = _K; uint lda, ldb; uint rowA, colA, rowB, colB, rowC, colC; uint numGroupsOnY; uint row, col; uint tid = get_local_id(0); int panel; int ACOLSTART, ACOLEND; uint MV; // // %WIDTH - Preferably 16 // %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C // %ITEMY must be divisible by %V for NN kernel // The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix // uint threadsY = %WIDTH; uint threadsX = get_local_size(0)/threadsY; // // Column-Major ordering of Workgroups // // %ITEMY - Number of elements , a workitem processes in Y direction. // %ITEMX - Number of elements , a workitem processes in X direction. // // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. // We dont use %PANEL in the current implementation. // MV = M; #ifndef TAIL_RUN { uint bidX, bidY; uint blockDimY; #ifdef M_TAIL_PRESENT MV = M - (M % (%V)); #endif if (MV == 0) { return; } blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1; bidY = ( get_group_id(0) % ( blockDimY)); bidX = ( get_group_id(0) / ( blockDimY)); // // Note: // Using the new Map function does not yeild any performnce gain. // In fact, it degraded the performance // Keep this commented. 
// //mapWorkGroupToTileNumber(M, N, &bidY, &bidX); // // is the left-top of the TILE region // in the output C matrix that will be determined // by this workgroup // row = (bidY * (threadsY * %ITEMY)); col = (bidX * (threadsX * %ITEMX)); } #else { uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA; uint bidY, bidX; if (M == tailStartM) { nWorkGroupsA = 0; } else { nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1); nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1); nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX; } if (get_group_id(0) < nWorkGroupsA) { bidY = get_group_id(0) % (nWorkGroupsAY); bidX = get_group_id(0) / nWorkGroupsAY; row = tailStartM + (bidY * threadsY * %ITEMY); col = (bidX * threadsX * %ITEMX); } else { uint nWorkGroupsBY, nWorkGroupsBX; nWorkGroupsBY = ((M-1)/threadsY) + 1; nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1; bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY); bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY; row = (bidY * threadsY * %ITEMY); col = tailStartN + (bidX * threadsX * %ITEMX); } } #endif // // ACOLSTART, ACOLEND // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row // with panels on B's block-column. // However due to symmetric nature of A/B matrix compounded by the fact that // only upper OR lower triangle of the symm matrix is available, vector-loads // are not possible while traversing certain regions of the matrix. // ACOLStart and ACOLEnd - signify what portion of SYMM can be achieved through // this NN kernel. The SYMM handler has to compose the SYMM in-terms of GEMM kernels // #ifdef __SYMM_LEFT__ // MxM * MxN A = _A + offa; lda = _lda; B = _B + offb; ldb = _ldb; K = M; #ifndef __SYMM_DIAGONAL__ #ifdef __SYMM_LOWER__ ACOLSTART = 0; ACOLEND = row; #elif defined(__SYMM_UPPER__) ACOLSTART = row + (threadsY*(%ITEMY)); ACOLEND = K; #else #error GEMM_NN_KERNEL #endif #else ACOLSTART = row; ACOLEND = row + (threadsY*(%ITEMY)); #endif if (ACOLEND > K) { ACOLEND = K; } /* if (get_local_id(0) == 0) { printf(\" GEMM_NN_KERNEL : SYMM_LEFT: Setting ACOLSTART to %d and ACOLEND to %d \\n \" , ACOLSTART, ACOLEND); } */ #elif defined(__SYMM_RIGHT__) // MxN * NxN A = _B + offb; lda = _ldb; B = _A + offa; ldb = _lda; K = N; #ifndef __SYMM_DIAGONAL__ #ifdef __SYMM_UPPER__ ACOLSTART = 0; ACOLEND = col; #elif defined(__SYMM_LOWER__) ACOLSTART = col + (threadsX*(%ITEMX)); ACOLEND = K; #else #error GEMM_NN_KERNEl #endif #else ACOLSTART = col; ACOLEND = col + (threadsX*(%ITEMX)); #endif if (ACOLEND > K) { ACOLEND = K; } #else A = _A + offa; B = _B + offb; K = _K; lda = _lda; ldb = _ldb; ACOLSTART = 0; ACOLEND = K; #endif uint offsetY = (tid % threadsY) * %V; uint offsetX = (tid / threadsY) * %ITEMX; rowA = row + offsetY; colB = (col+offsetX); #ifndef TAIL_RUN bool tailBlock = ((row >= M) || (col >= N)); #else bool tailBlock = (row >= tailStartM); #endif /* #ifdef TAIL_RUN if ((rowA >= M) || (colB >= N)) { return; } #endif */ #ifndef TAIL_RUN // Non-tail RUN if (tailBlock == true) { return; } #elif defined(TAIL_RUN) // TAIL RUN if (tailBlock == false) { return; } #else #error GEMM_NN_KERNEL #endif %TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // 8 #ifdef COMPLEX %TYPE%HV AVALEVEN[%V][(%ITEMY_BY_V)]; // 8 %TYPE%HV AVALODD[%V][(%ITEMY_BY_V)]; // 8 #endif %TYPE%V BVAL[%ITEMX]; #ifdef COMPLEX %TYPE%HV BVALEVEN[%ITEMX]; %TYPE%HV BVALODD[%ITEMX]; #endif %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX]; #ifdef COMPLEX %TYPE%HV CVALEVEN[(%ITEMY_BY_V)][%ITEMX]; %TYPE%HV CVALODD[(%ITEMY_BY_V)][%ITEMX]; #endif %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< 
(%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { CVAL[i][j] = (%TYPE%V) 0; #ifdef COMPLEX CVALEVEN[i][j] = (%TYPE%HV) 0; CVALODD[i][j] = (%TYPE%HV) 0; #endif } } uint ACOL; for(ACOL=ACOLSTART; ((ACOL+ %V -1) < ACOLEND); ACOL += %V) { { // // Load B values // %IF(%ITEMX) #pragma unroll %ITEMX for(uint bcol = 0; bcol < %ITEMX; bcol++) { #ifdef N_TAIL_PRESENT uint actualCol; actualCol = ((colB + bcol) >= N) ? (N-1) : (colB + bcol); #endif #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__) #ifndef N_TAIL_PRESENT BVAL[bcol] = %VLOAD(0, (&B[ACOL + (colB + bcol)*ldb])); #else BVAL[bcol] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb])); #endif #else // defined(__SYMM_DIAGONAL__) && defined(__SYMM_RIGHT__) #ifndef N_TAIL_PRESENT BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, (colB + bcol)); #else BVAL[bcol] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ACOL, actualCol); #endif #endif // // If Complex data, load the real and imaginary parts into separate register banks // #ifdef COMPLEX BVALEVEN[bcol] = BVAL[bcol].even; BVALODD[bcol] = BVAL[bcol].odd; #endif } } { // // Load A values // // // PENDNG BUG FIX: Unroll Factor should be according to PANEL Size // Previoously PANEL was size of V. So ITEMY worked // Current Workaround - Panel same as %V - See gemm_cached.cpp // %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint j=0; j< (%ITEMY_BY_V); j++) { #pragma unroll %V for(uint i = 0; i < %V; i++) { uint actualRow; #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__) #ifndef M_TAIL_PRESENT AVAL[i][j] = %VLOAD(0, (&A[(rowA + j*threadsY*(V)) + (ACOL + i)*lda]) ); #else actualRow = ((rowA + j*threadsY*(V)) >= MV) ? (MV-%V) : (rowA + j*threadsY*(V)); AVAL[i][j] = %VLOAD(0, (&A[actualRow + (ACOL + i)*lda]) ); #endif #else // CASE: SYMM_DIAGONAL && SYMM_LEFT #ifndef M_TAIL_PRESENT //AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) ); AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + j*threadsY*(V)) , (ACOL + i)); #else actualRow = ((rowA + j*threadsY*(V)) >= MV) ? 
(MV-%V) : (rowA + j*threadsY*(V)); AVAL[i][j] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, actualRow, (ACOL + i)); #endif #endif // // If Complex data, load the real and imaginary parts into separate register banks // #ifdef COMPLEX AVALEVEN[i][j] = AVAL[i][j].even; AVALODD[i][j] = AVAL[i][j].odd; #endif } } } { %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i<(%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { #ifndef COMPLEX %VFOR_REAL { CVAL[i][j] = mad(AVAL[%VFORINDEX][i], BVAL[j]%VFORSUFFIX, CVAL[i][j]); } #else // // Pending - Replace by %COMPLEX_VMAD() // %VFOR_REAL { // // PENDING Needs a FIX // CVALEVEN[i][j] = mad(AVALEVEN[%VFORINDEX][i], BVALEVEN[j]%VFORSUFFIX, CVALEVEN[i][j]); CVALODD[i][j] = mad(AVALEVEN[%VFORINDEX][i], BVALODD[j]%VFORSUFFIX, CVALODD[i][j]); CVALEVEN[i][j] = mad(AVALODD[%VFORINDEX][i], -BVALODD[j]%VFORSUFFIX, CVALEVEN[i][j]); CVALODD[i][j] = mad(AVALODD[%VFORINDEX][i], BVALEVEN[j]%VFORSUFFIX, CVALODD[i][j]); } #endif } } } #ifdef GEMM_NEEDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE); #endif } #ifdef COMPLEX %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]); } } #endif // // Tail blocks never execute this FOR loop as they execute with Vector Width of 1 // for(; ACOL < ACOLEND; ACOL ++) { // // Load B values // %IF(%ITEMX) #pragma unroll %ITEMX for(uint bcol = 0; bcol < %ITEMX; bcol++) { // // PENDING: PANEL iteration to Load the Panel Depth iterating by %V // #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_LEFT__) { %TYPE SCAL; #ifndef N_TAIL_PRESENT SCAL = B[ACOL + (colB + bcol)*ldb]; BVAL[bcol] = %VMAKEVEC(SCAL); #else SCAL = B[ACOL + ((colB + bcol)%(N))*ldb]; BVAL[bcol] = %VMAKEVEC(SCAL); #endif } #else // SYMM_DIAGONAL && SYMM_RIGHT { %TYPE SCAL; #ifndef N_TAIL_PRESENT SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL, (colB + bcol)); BVAL[bcol] = %VMAKEVEC(SCAL); #else SCAL = SYMM_SCALAR_LOAD(B, N, ldb, ACOL, ((colB + bcol)%(N))); BVAL[bcol] = %VMAKEVEC(SCAL); #endif } #endif } // // Load A values // %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V { #if !defined(__SYMM_DIAGONAL__) || defined(__SYMM_RIGHT__) #ifndef M_TAIL_PRESENT AVAL[0][i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) ); #else AVAL[0][i] = %VLOAD(0, (&A[(((rowA + i*threadsY*(V))) % (MV)) + (ACOL)*lda]) ); #endif #else // defined(DIAGONAL) && (LEFT) #ifndef M_TAIL_PRESENT AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, (rowA + i*threadsY*(V)) , (ACOL)); #else AVAL[0][i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ((rowA + i*threadsY*(V)) % (MV)), (ACOL)); #endif #endif } { %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i<(%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { %VMAD(CVAL[i][j] , AVAL[0][i] , BVAL[j]); } } } } /* if ((get_group_id(0) == 0) && (get_local_id(0) == 0)) { printf(\"Updating C Matrix: Alpha = %f, Beta = %f\\n\", alpha, beta); } */ // // STORE Result in C // %TYPE%V reg , betareg, alphareg; %TYPE%V alphav, betav; alphav = %VMAKEVEC(alpha); betav = %VMAKEVEC(beta); %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT) reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc])); %VMUL(betareg, betav, reg); %VMUL(alphareg, 
alphav, CVAL[i][j]); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc])); #else if (((rowA + i*threadsY*V) < MV) && ((colB + j) < N)) { reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc])); %VMUL(betareg, betav, reg); %VMUL(alphareg, alphav, CVAL[i][j]); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc])); } #endif } } return; } "; static const char *GEMM_NT_KERNEL = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif //#undef COMPLEX //#pragma OPENCL EXTENSION cl_amd_printf : enable __kernel void GEMM_NT__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C, uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc, %TYPE alpha, %TYPE beta #ifdef TAIL_RUN , uint tailStartM, uint tailStartN #endif ) { const int V = %V; __global %TYPE const *restrict A; __global %TYPE const *restrict B; __global %TYPE *C = _C + offc; uint K = _K; uint lda, ldb; uint rowA, colA, rowB, colB, rowC, colC; uint numGroupsOnY; uint row, col; uint tid = get_local_id(0); int panel; int ACOLSTART, ACOLEND; uint MV, NV; // // %WIDTH - Preferably 16 // %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C // %ITEMY and %ITEMX must be divisible by %V for NT kernel // The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix // uint threadsY = %WIDTH; uint threadsX = get_local_size(0)/threadsY; // // Column-Major ordering of Workgroups // // %ITEMY - Number of elements , a workitem processes in Y direction. // %ITEMX - Number of elements , a workitem processes in X direction. // // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. // We dont use %PANEL in the current implementation. // MV = M; NV = N; #ifndef TAIL_RUN { uint bidX, bidY; uint blockDimY; #ifdef M_TAIL_PRESENT MV = M - (M % (%V)); if (MV == 0) { return; } #endif #ifdef N_TAIL_PRESENT NV = N - (N% (%V)); if (NV == 0) { return; } #endif blockDimY = ((M-1) / (threadsY * %ITEMY)) + 1; uint blockID = get_group_id(0); getBlockNumber(blockDimY, blockID, &bidY, &bidX, 1); // // is the left-top of the TILE region // in the output C matrix that will be determined // by this workgroup // row = (bidY * (threadsY * %ITEMY)); col = (bidX * (threadsX * %ITEMX)); } #else { uint nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA; uint bidY, bidX; MV = M; if (M == tailStartM) { nWorkGroupsA = 0; } else { nWorkGroupsAY = ((M - tailStartM - 1)/threadsY + 1); nWorkGroupsAX = ((tailStartN - 1)/threadsX + 1); nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX; } if (get_group_id(0) < nWorkGroupsA) { bidY = get_group_id(0) % (nWorkGroupsAY); bidX = get_group_id(0) / nWorkGroupsAY; row = tailStartM + (bidY * threadsY * %ITEMY); col = (bidX * threadsX * %ITEMX); NV = tailStartN; } else { uint nWorkGroupsBY, nWorkGroupsBX; nWorkGroupsBY = ((M-1)/threadsY) + 1; nWorkGroupsBX = ((N-tailStartN-1)/threadsX) + 1; bidY = (get_group_id(0) - nWorkGroupsA) % (nWorkGroupsBY); bidX = (get_group_id(0) - nWorkGroupsA) / nWorkGroupsBY; row = (bidY * threadsY * %ITEMY); col = tailStartN + (bidX * threadsX * %ITEMX); NV = N; } } #endif // // ACOLSTART, ACOLEND // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row // with panels on B's block-column. 
// However due to symmetric nature of A matrix compounded by the fact that // only upper OR lower triangle of the symm matrix is available, vector-loads // are not possible while traversing certain regions of the matrix. // ACOLStart and ACOLEnd - signify what portion of SYMM can be achieved through // this NT kernel. The SYMM handler has to compose the SYMM in-terms of GEMM kernels // #ifdef __SYMM_LEFT__ #error GEMM_NT_KERNEL Should not be called in __SYMM_LEFT__ case! #elif defined(__SYMM_RIGHT__) // MxN * NxN A = _B + offb; lda = _ldb; B = _A + offa; ldb = _lda; K = N; #ifndef __SYMM_DIAGONAL__ #ifdef __SYMM_UPPER__ ACOLSTART = col + (threadsX*(%ITEMX)); ACOLEND = K; #elif defined(__SYMM_LOWER__) ACOLSTART = 0; ACOLEND = col; #else #error GEMM_NT_KERNEL : Neither SYMM_UPPER nor SYMM_LOWER is defined! #endif #else ACOLSTART = col; ACOLEND = col + (threadsX*(%ITEMX)); #endif if (ACOLEND > K) { ACOLEND = K; } #else // GEMM A = _A + offa; B = _B + offb; K = _K; lda = _lda; ldb = _ldb; ACOLSTART = 0; ACOLEND = K; #endif uint offsetY = (tid % threadsY) * %V; uint offsetX = (tid / threadsY) * %ITEMX; rowA = row + offsetY; colB = col + offsetX; #ifndef TAIL_RUN bool tailBlock = ((row >= M) || (col >= N)); #else bool tailBlock = ((row >= tailStartM) || (col >= tailStartN)); #endif /* Should be handled with TAIL_PRESENT Macros. if ((rowA >= M) || (colB >= N)) { return; } */ #ifndef TAIL_RUN // Non-tail RUN if (tailBlock == true) { return; } #else // TAIL RUN - This case never happens. if (tailBlock == false) { return; } #endif %TYPE%V CVAL[(%ITEMY_BY_V)][%ITEMX]; #ifdef COMPLEX %TYPE%HV CVALEVEN[(%ITEMY_BY_V)][%ITEMX]; %TYPE%HV CVALODD[(%ITEMY_BY_V)][%ITEMX]; #endif %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { CVAL[i][j] = (%TYPE%V) 0; #ifdef COMPLEX CVALEVEN[i][j] = (%TYPE%HV) 0; CVALODD[i][j] = (%TYPE%HV) 0; #endif } } uint ACOL; for(ACOL=ACOLSTART; ((ACOL+%V-1) < ACOLEND); ACOL += %V /* %PANEL */) { %TYPE%V AVAL[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V] %TYPE%V BVAL[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX] #ifdef COMPLEX %TYPE%HV AVALEVEN[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V] %TYPE%HV AVALODD[%V][(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V] %TYPE%HV BVALEVEN[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX] %TYPE%HV BVALODD[%ITEMX_BY_V][%V]; // [%PANEL][%ITEMX] #endif { // // Load B values // %IF(%V) #pragma unroll %V for(uint panel=0; panel < %V; panel++) { %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V for(uint bcol = 0; bcol < %ITEMX_BY_V; bcol++) { // // PENDING: PANEL iteration to Load the Panel Depth iterating by %V // #ifndef __SYMM_DIAGONAL__ #ifndef N_TAIL_PRESENT BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + (colB + bcol*(V))])); #else BVAL[bcol][panel] = %VLOAD(0, (&B[(ACOL + panel)*ldb + ((colB + bcol*V) % NV)])); #endif #else #ifndef N_TAIL_PRESENT BVAL[bcol][panel] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, (colB + bcol*(V)), (ACOL + panel)); #else BVAL[bcol][panel] = SYMM_VECTOR_LOAD_USING_SCALAR(B, N, ldb, ((colB + bcol*V) % NV), (ACOL + panel)); #endif #endif #ifdef CONJUGATE_B %TYPE%V conjTemp = BVAL[bcol][panel]; %CONJUGATE(1, conjTemp); BVAL[bcol][panel] = conjTemp; #endif #ifdef COMPLEX { BVALEVEN[bcol][panel] = BVAL[bcol][panel].even; BVALODD[bcol][panel] = BVAL[bcol][panel].odd; } #endif } } // // Load A values // %IF(%ITEMY) #pragma unroll %ITEMY for(uint i = 0; i < (%V * (%ITEMY_BY_V)) /* PANEL * ITEMY/V */; i++) { const uint yiterations = %ITEMY_BY_V; uint c = (i / 
yiterations); uint r = (i % yiterations); #ifndef M_TAIL_PRESENT AVAL[c][r] = %VLOAD(0, (&A[(rowA + r*threadsY*(V)) + (ACOL + c)*lda]) ); #else AVAL[c][r] = %VLOAD(0, (&A[((rowA + r*threadsY*(V)) % MV) + (ACOL + c)*lda]) ); #endif #ifdef COMPLEX AVALEVEN[c][r] = AVAL[c][r].even; AVALODD[c][r] = AVAL[c][r].odd; #endif } } %IF(%V) #pragma unroll %V for(uint panel=0; panel<(%V); panel++) { %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i<(%ITEMY_BY_V); i++) { %IF(%ITEMX_BY_V) #pragma unroll %ITEMX_BY_V for(uint j=0; j<(%ITEMX_BY_V); j++) { const int CX = j * (%V); #ifndef COMPLEX %VFOR_REAL { CVAL[i][CX + %VFORINDEX] = mad(AVAL[panel][i], BVAL[j][panel]%VFORSUFFIX, CVAL[i][CX + %VFORINDEX]); } #else // // PENDING: Replace with %COMPLEX_MAD op // %VFOR_REAL { CVALEVEN[i][CX + %VFORINDEX] = mad(AVALEVEN[panel][i], BVALEVEN[j][panel]%VFORSUFFIX, CVALEVEN[i][CX + %VFORINDEX]); CVALODD[i][CX + %VFORINDEX] = mad(AVALEVEN[panel][i], BVALODD[j][panel]%VFORSUFFIX, CVALODD[i][CX + %VFORINDEX]); CVALEVEN[i][CX + %VFORINDEX] = mad(AVALODD[panel][i], -BVALODD[j][panel]%VFORSUFFIX, CVALEVEN[i][CX + %VFORINDEX]); CVALODD[i][CX + %VFORINDEX] = mad(AVALODD[panel][i], BVALEVEN[j][panel]%VFORSUFFIX, CVALODD[i][CX + %VFORINDEX]); } #endif } } } #ifdef GEMM_NEEDS_BARRIER barrier(CLK_LOCAL_MEM_FENCE); #endif } #ifdef COMPLEX %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { %COMPLEX_JOIN(CVAL[i][j], CVALEVEN[i][j], CVALODD[i][j]); } } #endif // // Tail blocks never execute this FOR loop as they execute with Vector Width of 1 // for(; ACOL < ACOLEND; ACOL ++) { %TYPE%V AVAL[(%ITEMY_BY_V)]; // [%PANEL][%ITEMY_BY_V] %TYPE BVAL[%ITEMX]; // [%PANEL][%ITEMX] // // Load B values // %IF(%ITEMX) #pragma unroll %ITEMX for(uint bcol = 0; bcol < %ITEMX; bcol++) { %TYPE SCALAR; // // PENDING: PANEL iteration to Load the Panel Depth iterating by %V // { #ifndef __SYMM_DIAGONAL__ #ifndef N_TAIL_PRESENT SCALAR = B[ACOL*ldb + (colB + bcol)]; #else SCALAR = B[ACOL*ldb + ((colB + bcol) % NV)]; #endif #else #ifndef N_TAIL_PRESENT SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, (colB + bcol), ACOL ); #else SCALAR = SYMM_SCALAR_LOAD(B, N, ldb, ((colB + bcol) % NV), ACOL); #endif #endif #ifdef CONJUGATE_B %CONJUGATE(1, SCALAR); #endif BVAL[bcol] = (SCALAR); } } // // Load A values // %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i = 0; i < (%ITEMY_BY_V); i++) // 1 * ITEMY/V { #ifndef M_TAIL_PRESENT AVAL[i] = %VLOAD(0, (&A[(rowA + i*threadsY*(V)) + (ACOL)*lda]) ); #else AVAL[i] = %VLOAD(0, (&A[((rowA + i*threadsY*(V)) % MV) + (ACOL)*lda]) ); #endif } { %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i<(%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { %VMAD(CVAL[i][j] , AVAL[i] , BVAL[j]); } } } } // // STORE Result in C // %TYPE%V reg , betareg, alphareg; %TYPE%V alphav, betav; alphav = %VMAKEVEC(alpha); betav = %VMAKEVEC(beta); #ifndef HERK %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT) reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc])); %VMUL(betareg, betav, reg); %VMUL(alphareg, alphav, CVAL[i][j]); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc])); #else if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV)) { reg = %VLOAD(0, (&C[rowA + i*threadsY*V + (colB+j)*ldc])); 
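// Guarded tail path of the store: blend C(i,j) <- alpha*CVAL[i][j] + beta*C(i,j),
// vector-wide over %V consecutive rows, only for elements inside the MV x NV region.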
%VMUL(betareg, betav, reg); %VMUL(alphareg, alphav, CVAL[i][j]); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*threadsY*V) + (colB+j)*ldc])); } #endif } } #else %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i<(%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { int actualRow = rowA + i*threadsY*V; int actualCol = colB + j; #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT) { %VMUL(alphareg, alphav, CVAL[i][j]); //%TYPE temp[%V]; //*(__private %TYPE%V *)(&temp) = alphareg; //#pragma unroll %V //for(uint r = 0; r < %V; r++) %VFOR { #ifdef HERK_LOWER_TRIANGLE if((actualRow + %VFORINDEX) >= (actualCol)) #else if((actualRow + %VFORINDEX) <= (actualCol)) #endif { %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc]; %TYPE beta_s; %MUL(beta_s, beta, C_s); C_s = alphareg%VFORSUFFIX + beta_s; if((%VFORINDEX + actualRow) == actualCol) { C_s.odd = 0.0f; } C[%VFORINDEX + actualRow + actualCol * ldc] = C_s; } } } #else { if (((rowA + i*threadsY*V) < MV) && ((colB+j) < NV)) { %VMUL(alphareg, alphav, CVAL[i][j]); //%TYPE temp[%V]; //*(__private %TYPE%V *)(&temp) = alphareg; //#pragma unroll %V //for(uint r = 0; r < %V; r++) %VFOR { #ifdef HERK_LOWER_TRIANGLE if((%VFORINDEX + actualRow) >= (actualCol)) #else if((%VFORINDEX + actualRow) <= (actualCol)) #endif { %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc]; %TYPE beta_s; %MUL(beta_s, beta, C_s); C_s = alphareg%VFORSUFFIX + beta_s; if((%VFORINDEX + actualRow) == actualCol) { C_s.odd = 0.0f; } C[%VFORINDEX + actualRow + actualCol * ldc] = C_s; } } } } #endif } } #endif return; } "; static const char *GEMM_TN_KERNEL = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif //#pragma OPENCL EXTENSION cl_amd_printf : enable __kernel void GEMM_TN__KERNEL ( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *_C, uint M, uint N, uint _K, uint _lda, uint _ldb, uint ldc, uint offa, uint offb, uint offc, %TYPE alpha, %TYPE beta #ifdef TAIL_RUN , uint tailStartM, uint tailStartN #endif ) { const int V = %V; const int ITEMY = %ITEMY; __global %TYPE const *restrict A; __global %TYPE const *restrict B; __global %TYPE *C = _C + offc; uint K = _K; uint lda, ldb; uint rowA, colA, rowB, colB, rowC, colC; uint numGroupsOnY; uint row, col; uint tid = get_local_id(0); int panel; int ACOLSTART, ACOLEND; uint MV, bidX; uint bidY; uint blockDimX; // // %WIDTH - Preferably 16 // %ITEMY, %ITEMX - 1 Thread is responsible for %ITEMY * %ITEMX sub-matrix in C // %ITEMY must be divisible by %V for NN kernel // The entire workgroup loops-together to complete ITEMY-ITEMX sub-matrix // uint threadsY = %WIDTH; uint threadsX = get_local_size(0)/threadsY; // // Row-Major ordering of Workgroups // // %ITEMY - Number of elements , a workitem processes in Y direction. // %ITEMX - Number of elements , a workitem processes in X direction. // // %V - Vectoring Width // %PANEL(*) - Panel Width to access Rows of A and Columns of B // Right now, %V is assumed to be the panel width. // We dont use %PANEL in the current implementation. 
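// Note: unlike the NN/NT kernels above, this TN kernel accumulates a scalar result per
// (i,j) element of C. Each work-item steps through K in chunks of %V, vector-loading a
// row of A (transposed access) and a column of B, and reducing the %V partial products
// into CVAL[i][j] (%VMAD_AND_REDUCE for real types, even/odd halves for COMPLEX).
// When ldb is a multiple of 512, the starting K offset is staggered per work-group and
// wrapped back to ACOLSTART, presumably to spread loads across memory channels.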
// MV = M; #ifndef TAIL_RUN { blockDimX = ((N-1) / (threadsX * %ITEMX)) + 1; uint blockID = get_group_id(0); getBlockNumber(blockDimX, blockID, &bidY, &bidX, 0); // // is the left-top of the TILE region // in the output C matrix that will be determined // by this workgroup // row = (bidY * (threadsY * %ITEMY)); col = (bidX * (threadsX * %ITEMX)); } #else #error GEMM_TN_KERNEL: TAIL_RUN is NOT needed for TN Kernel! #endif // // ACOLSTART, ACOLEND // SYMM Matrix multiplication proceeds by multiplying panels on A's block-row // with panels on B's block-column. // However due to symmetric nature of A/B matrix compounded by the fact that // only upper OR lower triangle of the symm matrix is available, vector-loads // are not possible while traversing certain regions of the matrix. // ACOLStart and ACOLEnd - signify what portion of SYMM can be achieved through // this TN kernel. The SYMM handler has to compose the SYMM in-terms of GEMM kernels // SYMMETRIC LOAD routines are used when traversing the diaognal region wherease normal rules // hold good otherwise. // #ifdef __SYMM_LEFT__ // MxM * MxN A = _A + offa; lda = _lda; B = _B + offb; ldb = _ldb; K = M; #ifndef __SYMM_DIAGONAL__ #ifdef __SYMM_LOWER__ ACOLSTART = row + (threadsY * %ITEMY); ACOLEND = K; /* if (get_local_id(0) == 0) { printf(\"GEMM_TN_KERNEL: SYMM_LOWER: Setting ACOLSTART to %d, ACOLEND = %d\\n\", ACOLSTART, ACOLEND); } */ #elif defined(__SYMM_UPPER__) ACOLSTART = 0; ACOLEND = row; #else #error GEMM_TN_KERNEL #endif #else ACOLSTART = row; ACOLEND = row + (threadsY * %ITEMY); #endif if (ACOLEND > K) { ACOLEND = K; } #elif defined(__SYMM_RIGHT__) // MxN * NxN #error GEMM_TN_KERNEL: Internal Error: Should not be called in SYMM_RIGHT case! Right is Wrong! #else // GEMM Case A = _A + offa; B = _B + offb; K = _K; lda = _lda; ldb = _ldb; ACOLSTART = 0; ACOLEND = K; #endif uint offsetX = (tid % threadsX) * %ITEMX; uint offsetY = (tid / threadsX) * %ITEMY; rowA = (row + offsetY); colB = (col + offsetX); #ifndef TAIL_RUN bool tailBlock = ((row >= M) || (col >= N)); #else #error GEMM_TN_KERNEL: No TAIL_RUN for TN case #endif %TYPE%V AVAL[%ITEMY]; // %ITEMY * %PANEL #ifdef COMPLEX %TYPE%HV AVALEVEN[%ITEMY]; // %ITEMY * %PANEL %TYPE%HV AVALODD[%ITEMY]; // %ITEMY * %PANEL #endif %TYPE%V BVAL[%ITEMX]; #ifdef COMPLEX %TYPE%HV BVALEVEN[%ITEMX]; // %ITEMY * %PANEL %TYPE%HV BVALODD[%ITEMX]; // %ITEMY * %PANEL #endif %TYPE CVAL[%ITEMY][%ITEMX]; #ifdef COMPLEX %TYPE%HV CVALEVEN[%ITEMY][%ITEMX]; // %ITEMY * %PANEL %TYPE%HV CVALODD[%ITEMY][%ITEMX]; // %ITEMY * %PANEL #endif %IF(%ITEMY) #pragma unroll %ITEMY for(uint i=0; i< (%ITEMY); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { #ifdef COMPLEX CVAL[i][j] = (%TYPE) 0; CVALEVEN[i][j] = (%TYPE%HV) 0; CVALODD[i][j] = (%TYPE%HV) 0; #else CVAL[i][j] = (%TYPE) 0; #endif } } int ACOL; uint actualCol; uint actualRow; int ACOLENDV; int numIterations = (ACOLEND - ACOLSTART) / (%V) ; if (numIterations >= 0) { ACOLENDV = ACOLSTART + (numIterations * (%V)); } else { ACOLENDV = ACOLEND; } if (ldb % (512) == 0) // PENDING: 512 needs to be a configurable { // // ASSUMPTION(SYMM Variants): \"ACOLSTART\" is perfectly divisble by \"%V\" // ACOLSTART depends on the tile size on Y direction // Since Vector-sizes are hardly 1, 2,4, 8 or 16, we can assume that // this is indeed the case // // // Assumption is that 32/16/8 is divisble by any value in %V // int num32Iterations = (ACOLENDV - ACOLSTART) / (32/(sizeof(%TYPE)/sizeof(float))); if (num32Iterations <= 0) { ACOL = ACOLSTART; } else { int 
startIteration = bidX % num32Iterations; ACOL = ACOLSTART + ( startIteration * (32/(sizeof(%TYPE)/sizeof(float)))); } } else { ACOL = ACOLSTART; } for(int itr=0; itr= MV) ? (MV-1) : (rowA + i); AVAL[i] = %VLOAD(0, (&A[actualRow*lda + ACOL]) ); #endif #else #ifndef M_TAIL_PRESENT AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, (rowA+i)); #else actualRow = ((rowA + i) >= MV) ? (MV-1) : (rowA + i); AVAL[i] = SYMM_VECTOR_LOAD_USING_SCALAR(A, M, lda, ACOL, actualRow); #endif #endif #ifdef CONJUGATE_A %TYPE%V conjTemp = AVAL[i]; %CONJUGATE(1, conjTemp); AVAL[i] = conjTemp; #endif #ifdef COMPLEX AVALEVEN[i] = AVAL[i].even; AVALODD[i] = AVAL[i].odd; #endif } // // Load B values // %IF(%ITEMX) #pragma unroll %ITEMX for(int j=0; j<(%ITEMX); j++) { #ifndef N_TAIL_PRESENT BVAL[j] = %VLOAD(0, (&B[ACOL + (colB + j)*ldb])); #else actualCol = ((colB + j) >= N) ? (N-1) : (colB + j); BVAL[j] = %VLOAD(0, (&B[ACOL + (actualCol)*ldb])); #endif #ifdef COMPLEX BVALEVEN[j] = BVAL[j].even; BVALODD[j] = BVAL[j].odd; #endif } } // LOAD A and B Over // MATH Begin %IF(%ITEMX) #pragma unroll %ITEMX for(int j=0; j<(%ITEMX); j++) { %IF(%ITEMY) #pragma unroll %ITEMY for(int i=0; i<(%ITEMY); i++) { #ifndef COMPLEX %VMAD_AND_REDUCE(CVAL[i][j] , AVAL[i], BVAL[j]); #else CVALEVEN[i][j] = mad(AVALEVEN[i], BVALEVEN[j], CVALEVEN[i][j]); CVALEVEN[i][j] = mad(AVALODD[i], -BVALODD[j], CVALEVEN[i][j]); CVALODD[i][j] = mad(AVALEVEN[i], BVALODD[j], CVALODD[i][j]); CVALODD[i][j] = mad(AVALODD[i], BVALEVEN[j], CVALODD[i][j]); /* EVENSUM = AVALEVEN[i] * BVALEVEN[j]; EVENSUM = mad(AVALODD[i], -BVALODD[j], EVENSUM); ODDSUM = AVALEVEN[i]*BVALODD[j]; ODDSUM = mad(AVALODD[i], BVALEVEN[j], ODDSUM); CVAL[i][j].S0 += EVENSUM.S0 + EVENSUM.S1; CVAL[i][j].S1 += ODDSUM.S0 + ODDSUM.S1; */ #endif } } ACOL = ((ACOL + %V) == ACOLENDV) ? 
ACOLSTART : (ACOL + %V); //%PANEL } #ifdef COMPLEX { %IF(%ITEMY) #pragma unroll %ITEMY for(uint i=0; i< (%ITEMY); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { CVAL[i][j].even = %REDUCE_SUM_REAL_HV(CVALEVEN[i][j]); CVAL[i][j].odd = %REDUCE_SUM_REAL_HV(CVALODD[i][j]); } } } #endif ACOL = ACOLENDV; for(; ACOL < ACOLEND; ACOL ++) { // // Load B values // %IF(%ITEMX) #pragma unroll %ITEMX for(uint bcol = 0; bcol < %ITEMX; bcol++) { // // PENDING: PANEL iteration to Load the Panel Depth iterating by %V // #ifndef N_TAIL_PRESENT BVAL[bcol] = %VMAKEVEC(B[ACOL + (colB + bcol)*ldb]); #else BVAL[bcol] = %VMAKEVEC(B[ACOL + ((colB + bcol)%(N))*ldb]); #endif } // // Load A values // %IF(%ITEMY) #pragma unroll %ITEMY for(uint i = 0; i < (%ITEMY); i++) // 1 * ITEMY/V { #ifndef __SYMM_DIAGONAL__ { #ifndef M_TAIL_PRESENT AVAL[i] = %VMAKEVEC(A[(rowA + i)*lda + ACOL]); #else AVAL[i] = %VMAKEVEC(A[((rowA + i) % MV)*lda + ACOL]); #endif } #else { %TYPE t; #ifndef M_TAIL_PRESENT t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, (rowA+i) ); #else t = SYMM_SCALAR_LOAD(A, M, lda, ACOL, ((rowA + i) % MV)); #endif AVAL[i] = %VMAKEVEC(t); } #endif #ifdef CONJUGATE_A %CONJUGATE(1, AVAL[i]); #endif } { %IF(%ITEMY) #pragma unroll %ITEMY for(uint i=0; i<(%ITEMY); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { %MAD_AND_REDUCE(CVAL[i][j] , AVAL[i] , BVAL[j]); } } } } // // STORE Result in C // %TYPE%V reg , betareg, alphareg; %TYPE reg_s , betareg_s, alphareg_s; %TYPE%V alphav, betav; alphav = %VMAKEVEC(alpha); betav = %VMAKEVEC(beta); //%TYPE CVALV_TEMP[%V]; %TYPE%V CVALV; #ifndef HERK %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { //#pragma unroll %V //for(uint k=0; k< (%V); k++) %VFOR { CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j]; } //CVALV = *(__private %TYPE%V *)CVALV_TEMP; #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT) reg = %VLOAD(0, (&C[(rowA + i*V) + (colB+j)*ldc])); %VMUL(betareg, betav, reg); %VMUL(alphareg, alphav, CVALV); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc])); #else if (((rowA + i*V + V - 1) < M) && ((colB + j) < N)) { reg = %VLOAD(0, (&C[rowA + i*V + (colB+j)*ldc])); %VMUL(betareg, betav, reg); %VMUL(alphareg, alphav, CVALV); %ADD( reg, betareg, alphareg); %VSTORE(reg, 0, (&C[(rowA + i*V) + (colB+j)*ldc])); } else { if ((colB + j) < N) { //%TYPE TEMP[%V]; //*(__private %TYPE%V *) TEMP = CVALV; //#pragma unroll %V //for(uint v=0; ((v< %V) && ((rowA + (i * %V) + v) < M) ); v++) %VFOR { if (((rowA + (i * %V) + %VFORINDEX) < M) ) { %TYPE c; c = C[rowA + i*V + %VFORINDEX + (colB+j)*ldc]; %MUL(betareg_s, c, beta); c = CVALV%VFORSUFFIX; %MUL(alphareg_s, c, alpha); %ADD(c, betareg_s, alphareg_s); C[rowA + i*V + %VFORINDEX + (colB+j)*ldc] = c; } } } } #endif } } #else %IF(%ITEMY_BY_V) #pragma unroll %ITEMY_BY_V for(uint i=0; i< (%ITEMY_BY_V); i++) { %IF(%ITEMX) #pragma unroll %ITEMX for(uint j=0; j<(%ITEMX); j++) { int actualRow = rowA + i*V; int actualCol = colB + j; //#pragma unroll %V //for(uint k=0; k< (%V); k++) %VFOR { CVALV%VFORSUFFIX = CVAL[i*V + %VFORINDEX][j]; } //CVALV = *(__private %TYPE%V *)CVALV_TEMP; #if !defined(M_TAIL_PRESENT) && !defined(N_TAIL_PRESENT) %VMUL(alphareg, alphav, CVALV); //%TYPE temp[%V]; //*(__private %TYPE%V *)(&temp) = alphareg; //#pragma unroll %V //for(uint r = 0; r < %V; r++) %VFOR { #ifdef HERK_LOWER_TRIANGLE if((%VFORINDEX + actualRow) >= (actualCol)) #else if((%VFORINDEX + 
actualRow) <= (actualCol)) #endif { %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc]; %TYPE beta_s; %MUL(beta_s, beta, C_s); C_s = alphareg%VFORSUFFIX + beta_s; if((%VFORINDEX + actualRow) == actualCol) { C_s.odd = 0.0f; } C[%VFORINDEX + actualRow + actualCol * ldc] = C_s; } } #else if (((rowA + i*V + V - 1) < M) && ((colB + j) < N)) { %VMUL(alphareg, alphav, CVALV); //%TYPE temp[%V]; //*(__private %TYPE%V *)(&temp) = alphareg; //#pragma unroll %V //for(uint r = 0; r < %V; r++) %VFOR { #ifdef HERK_LOWER_TRIANGLE if((%VFORINDEX + actualRow) >= (actualCol)) #else if((%VFORINDEX + actualRow) <= (actualCol)) #endif { %TYPE C_s = C[%VFORINDEX + actualRow + actualCol * ldc]; %TYPE beta_s; %MUL(beta_s, beta, C_s); C_s = alphareg%VFORSUFFIX + beta_s; if((%VFORINDEX + actualRow) == actualCol) { C_s.odd = 0.0f; } C[%VFORINDEX + actualRow + actualCol * ldc] = C_s; } } } else { if ((colB + j) < N) { //%TYPE TEMP[%V]; //*(__private %TYPE%V *)(&TEMP) = CVALV; //#pragma unroll %V //for(uint r=0; ((r< %V) && ((rowA + (i * %V) + r) < M) ); r++) %VFOR { if (((rowA + (i * %V) + %VFORINDEX) < M)) { #ifdef HERK_LOWER_TRIANGLE if((%VFORINDEX + actualRow) >= (actualCol)) #else if((%VFORINDEX + actualRow) <= (actualCol)) #endif { %TYPE c; c = C[%VFORINDEX + actualRow + (actualCol)*ldc]; %MUL(betareg_s, c, beta); c = CVALV%VFORSUFFIX; %MUL(alphareg_s, c, alpha); %ADD(c, betareg_s, alphareg_s); if((%VFORINDEX + actualRow) == (actualCol)) { c.odd = 0.0f; } C[%VFORINDEX + actualRow + actualCol * ldc] = c; } } } } } #endif } } #endif return; } "; clblas-2.10/src/library/blas/gens/clTemplates/gemm_helper.cl000066400000000000000000000054721264277366700241210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ static const char *GEMM_HELPER = " void getBlockNumber(uint nBlocks, uint blockID, uint *bidY, uint *bidX, uint flag) { #ifndef HERK { if(flag) //Column Major ordering for NT kernels { *bidY = ( blockID % ( nBlocks)); *bidX = ( blockID / ( nBlocks)); } else //Row Major ordering for TN kernels { *bidX = ( blockID % ( nBlocks)); *bidY = ( blockID / ( nBlocks)); } } #else { volatile uint _i = 0, _j = 0; for ( _j = (blockID / nBlocks); _j < nBlocks; _j++) { _i = blockID - ((_j*((2* nBlocks) + 1 - _j))/2) + _j; if ( _i < nBlocks && ( _i >= 0) ) { break; } } #ifdef HERK_LOWER_TRIANGLE *bidY = _i; *bidX = _j; #else *bidY = _j; *bidX = _i; #endif } #endif } // // mapWorkGroupToTileNumber() - Maps a workgroup number to a Tile position in output matrix // Groups the full tiles together and half-tiles together and maps the workgroup number // such that full tiles are processed wholly by consecutive workgroups and half-tiles are // processed by consecutive workgroups // // ASSUMPTION: // Assumes column major numbering of workgroup // // Observation: // This new grouping yielded worse results than normal column-major order. // Tested with GEMM NN kernel. So, we will not be using this function. // This is here just for completeness sake // void mapWorkGroupToTileNumber(uint M, uint N, uint *bidY, uint *bidX) { uint fullTilesOnY, numTilesOnX; uint relativeGroupId; numTilesOnX = ((N-1) / ((get_local_size(0) / %WIDTH) * %ITEMX)) + 1; fullTilesOnY = (M / (%WIDTH * %ITEMY)); if (get_group_id(0) < (numTilesOnX * fullTilesOnY) ) { *bidY = ( get_group_id(0) % ( fullTilesOnY)); *bidX = ( get_group_id(0) / ( fullTilesOnY)); } else { relativeGroupId = get_group_id(0) - (numTilesOnX * fullTilesOnY); *bidY = fullTilesOnY; *bidX = relativeGroupId; } } "; clblas-2.10/src/library/blas/gens/clTemplates/ger.cl000066400000000000000000000203401264277366700224010ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ // Column-Major Case static const char *ger_C_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define BH %BH_DEF #define BW %BW_DEF __kernel void %PREFIXger_C_kernel( __global %TYPE const* restrict _X, __global %TYPE const* restrict _Y, __global %TYPE* _A, uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda, %TYPE alpha, int doConj ) { __global %TYPE* A; __global %TYPE const* restrict X; __global %TYPE const* restrict Y; A = _A + offa; X = _X + offx; Y = _Y + offy; if ( incx < 0 ) // Goto end of vector { X = X + ( M - 1) * abs(incx); } if ( incy < 0 ) // Goto end of vector { Y = Y + ( N - 1) * abs(incy); } // create local memory __local %TYPE%V localXV[ BH ]; __local %TYPE *localX = (__local %TYPE *)localXV; __local %TYPE localY[ BW ]; uint lID = get_local_id( 0 ); uint gID = get_group_id( 0 ); uint tIDy = lID & ( BH-1 ); //get y coordinate of a thread in 1D workgroup uint tIDx = lID / BH; //get x coordinate of a thread in !D workgroup uint nBlocksX = (( N + BW - 1) / BW ); uint nBlocksY = (( M + BH * %V - 1 ) / ( BH * %V )); uint gIDy = gID % nBlocksY; //get y coordinate of a workgroup in 1D grid uint gIDx = gID / nBlocksY; // get x coordinate of a workgroup in a 1D grid uint row = (( BH * gIDy)+ tIDy) * %V; uint col = (( BW * gIDx)+ tIDx); if( (gIDx != (nBlocksX-1)) && (gIDy != (nBlocksY-1)) ) // Completely vector blocks { //populate local memory for( int i = lID; i< ( BH * %V); i+= get_local_size(0) ) { int idx = i + ( gIDy * BH * %V); localX[ i ] = *(X + (idx * incx)); } for( int i = lID; i< BW; i+= get_local_size(0) ) { int idx = i + ( gIDx * BW); localY[ i ] = *(Y + (idx * incy)); } barrier(CLK_LOCAL_MEM_FENCE); %TYPE%V prevA, temp; %TYPE yReg = localY[ tIDx ]; %TYPE%V xReg = *(__local %TYPE%V*)(&localX[ tIDy * %V]); prevA = %VLOAD( 0, ( A + col*lda + row ) ); %CONJUGATE(doConj, yReg); %VMUL( temp, xReg, alpha ); %VMAD( prevA, temp, yReg); %VSTORE( prevA, 0 , ( A + col*lda + row ) ); } else // Border blocks in both X & Y direction { //populate local memory for( int i = lID; i< ( BH * %V); i+= get_local_size(0) ) { int idx = i + ( gIDy * BH * %V); if ( idx < M ) { localX[ i ] = *(X + (idx * incx)); } } for( int i = lID; i< BW; i+= get_local_size(0) ) { int idx = i + ( gIDx * BW); if ( idx < N) { localY[ i ] = *(Y + (idx * incy)); } } barrier(CLK_LOCAL_MEM_FENCE); uint gTIDx = (gIDx * BW) + tIDx; if ( gTIDx < N) // if whithin last column { if( (row + %V - 1) < M ) // if the next V rows are still within M, then do vector math { %TYPE%V prevA, temp; %TYPE yReg = localY[ tIDx ]; %TYPE%V xReg = *(__local %TYPE%V*)(&localX[ tIDy * %V]); prevA = %VLOAD( 0, ( A + col*lda + row ) ); %CONJUGATE(doConj, yReg); %VMUL( temp, xReg, alpha ); %VMAD( prevA, temp, yReg); %VSTORE( prevA, 0 , ( A + col*lda + row ) ); } else if( row < M ) //else do scalar multiplication { %TYPE xRegS, yReg, prevA, temp; for( int i=row; i const char * trsv_CU_SolveTriangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) #elif defined(BANDED) #define A( row, col) A[ (row) * lda + (col)] #else #define A( row, col) A[ (row) + (col) * lda] #endif // Only one workgroup of threads launched __kernel void 
%PREFIXtrsv_CU_SolveTriangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startRow, int startCol, uint offa, uint offx #ifdef BANDED , uint KU #endif ) { __global %TYPE* xnew; __global %TYPE const * restrict A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } __local %TYPE xShared; // To share solved x value with other threads.. size_t gIdx = get_global_id(0); size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); %TYPE sum = %MAKEVEC(0.0); %TYPE xVal = %MAKEVEC(0.0); %TYPE loadedA = %MAKEVEC(0.0); int targetCol = startCol; int targetRow = startRow + threadIdx; int loops = (startCol - startRow) + 1; #ifdef BANDED int bandCol = (loops - 1) - threadIdx; #endif for( int i=0; i < loops; i++) { if ( targetRow == targetCol) { xVal = xnew[ targetRow * incx]; %SUB(sum, xVal, sum); if( isUnity) { xShared = sum; } else // Handle diagonal element { #ifdef BANDED loadedA = A((targetRow), (bandCol)); #else loadedA = A((targetRow), (targetCol)); #endif %CONJUGATE(doConj, loadedA); %DIV(xShared, sum, loadedA); } xnew[ targetRow * incx ] = xShared; } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow < targetCol) { #ifdef BANDED loadedA = A((targetRow), (bandCol)); #else loadedA = A((targetRow), (targetCol)); #endif %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); targetCol--; #ifdef BANDED bandCol--; #endif } }"; const char * trsv_CL_SolveTriangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) #elif defined(BANDED) #define A(row, col) A[ (row) * lda + (col)] #else #define A(row, col) A[ (row) + (col) * lda] #endif #pragma OPENCL EXTENSION cl_amd_printf : enable // Only one block of threads launched __kernel void %PREFIXtrsv_CL_SolveTriangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int endRow, uint offa, uint offx #ifdef BANDED , uint KL #endif ) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } __local %TYPE xShared; // To share solved x value with other threads.. 
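//
// Forward substitution within a single work-group: on every iteration one work-item
// sits on the diagonal (targetRow == targetCol); it finishes its unknown (dividing by
// the diagonal element unless isUnity) and publishes it through xShared. After the
// barrier, work-items with targetRow > targetCol fold A(row,col)*xShared into their
// running sum, then targetCol advances to the next column.
//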
size_t gIdx = get_global_id(0); size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); %TYPE sum = %MAKEVEC(0.0); %TYPE xVal = %MAKEVEC(0.0); %TYPE loadedA = %MAKEVEC(0.0); int targetCol = startCol; int targetRow = endRow - threadIdx; int loops = (endRow - startCol) + 1; #ifdef BANDED int bandCol = (KL + 1) - loops + threadIdx; #endif // printf(\"%u : bandCol %d targetCol %d targetRow %d loops %d KL %d\\n\", threadIdx, bandCol, targetCol, targetRow, loops, KL); for( int i=0; i < loops; i++) { if ( targetRow == targetCol) { xVal = xnew[ targetRow * incx]; //printf(\"Before1 %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum); %SUB(sum, xVal, sum); //printf(\"Before2 %u : xShared %f, sum %f XvAL %f, targetRow %d\\n\", threadIdx, xShared, sum, xVal, targetRow); if( isUnity) { xShared = sum; } else // Handle diagonal element { #ifndef BANDED loadedA = A((targetRow), (targetCol)); #else loadedA = A((targetRow), (bandCol)); #endif %CONJUGATE(doConj, loadedA); %DIV(xShared, sum, loadedA); } //printf(\"After %u : xShared %f, sum %f\\n\", threadIdx, xShared, sum); xnew[ targetRow * incx ] = xShared; } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow > targetCol) { #ifndef BANDED loadedA = A((targetRow), (targetCol)); #else loadedA = A((targetRow), (bandCol)); #endif %CONJUGATE(doConj, loadedA); //printf(\"%u : xShared %f, sum %f loadedA %f\\n\", threadIdx, xShared, sum, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); targetCol++; #ifdef BANDED bandCol++; #endif } } "; const char * trsv_CUT_SolveTriangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) #elif defined(BANDED) #define A(row, col) A[ (row) * lda + (col)] #else #define A( row, col) A[ (row) + (col) * lda] #endif #pragma OPENCL EXTENSION cl_amd_printf : enable __kernel void %PREFIXtrsv_CUT_SolveTriangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startRow, int endRow, uint offa, uint offx #ifdef BANDED , uint KU #endif ) { __global %TYPE* xnew; __global const %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } int blockSize = get_local_size(0); int threadID = get_local_id(0); int targetRow; #ifdef BANDED int bandRow = startRow; int bandCol = threadID; // printf(\"threadID %d, bandRow %d bandCol %d\\n\",threadID, bandRow, bandCol); #endif __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT]; targetRow = startRow + threadID; //#pragma unroll for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && ((startRow + idx) < endRow); idx += blockSize) { saccShared[idx] = xnew[ (startRow + idx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); %TYPE diagA = %INIT(0.0); if(targetRow < endRow) { #ifndef BANDED diagA = A((targetRow), (targetRow)); #else diagA = A((startRow + threadID), (0)); #endif %CONJUGATE(doConj, diagA); } %TYPE tempA, tempS; for(int i = 0; i < %TRIANGLE_HEIGHT; i++) { if((i <= threadID) && (i > 0) && (targetRow < endRow)) { #ifndef BANDED tempA = A((startRow + i - 1), (targetRow)); #else tempA = A((bandRow - 1), (bandCol + 1)); // printf(\"threadID %d, bandRow %d bandCol %d A %f\\n\",threadID, bandRow, bandCol, tempA); #endif %CONJUGATE(doConj, tempA); %MUL(tempS, tempA, saccShared[i-1]); 
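// Remove the contribution of the just-solved unknown (held in saccShared[i-1]) from
// this work-item's partial right-hand side before its own diagonal turn.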
%SUB(saccShared[threadID], saccShared[threadID], tempS); } if((i == threadID) && (targetRow < endRow) && (!isUnity)) { tempS = saccShared[threadID]; // printf(\"threadID %d, saccShared %f, diagA %f\\n\", threadID, tempS, diagA); %DIV(saccShared[threadID], tempS, diagA); } barrier(CLK_LOCAL_MEM_FENCE); #ifdef BANDED bandRow++; bandCol--; #endif } barrier(CLK_LOCAL_MEM_FENCE); if(targetRow < endRow) { xnew[(targetRow * incx)] = saccShared[threadID]; } } "; const char * trsv_CLT_SolveTriangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) #elif defined(BANDED) #define A(row, col) A[ (row) * lda + (col)] #else #define A(row, col) A[ (row) + (col) * lda] #endif #pragma OPENCL EXTENSION cl_amd_printf : enable // Column-Major Lower Non-Unity case // StartRow points to actual Row to start from( absolute Column number) // endRow points to actual Row to stop + 1( absolute Column number) __kernel void %PREFIXtrsv_CLT_SolveTriangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startRow, int endRow, uint offa, uint offx #ifdef BANDED ,uint KL #endif ) { __global %TYPE* xnew; __global const %TYPE *A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } int blockSize = get_local_size(0); int threadID = get_local_id(0); __local volatile %TYPE saccShared[%TRIANGLE_HEIGHT]; int targetRow; targetRow = (endRow - 1) - threadID; #ifdef BANDED int bandRow = (endRow - 1); int bandCol = (KL) - threadID; #endif //#pragma unroll for( int idx = threadID; (idx < %TRIANGLE_HEIGHT) && (((endRow - 1) - idx) >= startRow); idx += blockSize) { saccShared[idx] = xnew[((endRow - 1) - idx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); %TYPE diagA = %INIT(0.0); if(targetRow >= startRow) { #ifndef BANDED diagA = A((targetRow), (targetRow)); #else diagA = A((bandRow - threadID), (KL)); // printf(\"ThreadID %d, bandRow %d bandCol %d\\n\", threadID, bandRow, bandCol); #endif %CONJUGATE(doConj, diagA); } %TYPE tempA, tempS; for( int i = (endRow - 1); i >= startRow; i--) { if((targetRow == i) && (!isUnity)) { tempS = saccShared[threadID]; %DIV(saccShared[threadID], tempS, diagA); } barrier(CLK_LOCAL_MEM_FENCE); if((targetRow < i) && (targetRow >= startRow)) { #ifndef BANDED tempA = A((i), (targetRow)); #else tempA = A((bandRow), (bandCol)); #endif %CONJUGATE(doConj, tempA); %MUL(tempS, tempA, saccShared[(endRow - 1) - i]); %SUB(saccShared[threadID], saccShared[threadID], tempS); } barrier(CLK_LOCAL_MEM_FENCE); #ifdef BANDED bandRow--; bandCol++; #endif } barrier(CLK_LOCAL_MEM_FENCE); if(targetRow >= startRow) { xnew[(targetRow * incx)] = saccShared[threadID]; } } "; clblas-2.10/src/library/blas/gens/clTemplates/trsv_gemv.cl000066400000000000000000001136441264277366700236520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ // Compute Rectangle + Traingle const char * trsv_CU_ComputeRectangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V)) #ifdef PACKED #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) #else #define A( row, col) A[ (row) + (col) * lda] #endif __kernel void %PREFIXtrsv_CU_ComputeRectangle_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); // Get total blocks launched size_t nBlocks = ((rowsLeft - 1) / %TARGET_ROWS) + 1; %TYPE sum = %MAKEVEC( 0.0); %TYPE loadedA = %MAKEVEC( 0.0); // First Block does scalar stuff... // Only this gets executed if nBlocks == 1 if ( bIdx == 0) { int targetCol = startCol; int targetRow = threadIdx; int lastRow = rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1; if ( nBlocks > 1) { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol--; } %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum); } } else // Solve the traingle -- no more kernel launches required { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol--; } } // Change targetCol to point to Triangle last column for all threads // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true targetCol = startCol - %TARGET_ROWS; __local %TYPE xShared; // To share solved x value with other threads.. 
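//
// Single-launch finish: after the rectangular update, back-substitute the remaining
// triangle in place. The thread whose targetRow matches targetCol solves that unknown,
// broadcasts it through xShared, and the rows above it fold A(row,col)*xShared into
// their running sums before targetCol moves left.
//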
for( int i=0; i < (lastRow + 1); i++) { if ( targetRow == targetCol) { %TYPE xVal = xnew[ targetRow * incx]; %SUB(sum, xVal, sum); xShared = sum; xnew[ targetRow * incx ] = xShared; } barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow < targetCol) { loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race barrier(CLK_LOCAL_MEM_FENCE); targetCol--; } } } else { size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V); size_t colShift = threadIdx / TARGET_ROWS_BY_VEC; int rowStart = rowsLeft - ( %TARGET_ROWS * (nBlocks - bIdx) ); int row = rowStart + rowShift; %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sum = %VMAKEVEC(sumTemp); __local %TYPE xData[ %TARGET_WIDTH]; //#pragma unroll for( int i=1; i <= %NLOOPS; i++) { // Put startCol to start of BLOCKSIZE Block int startColp = startCol - (%TARGET_WIDTH * i) + 1; if ( threadIdx < %TARGET_WIDTH) { xData[threadIdx] = xnew[ (startColp + threadIdx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); int col = startColp + colShift; %TYPE xDataVal = xData[ colShift ]; %TYPE%V xDataVec= %VMAKEVEC( xDataVal); %TYPE%V loadedA = %VLOAD( 0, &A((row), (col))); %CONJUGATE(doConj, loadedA); %VMAD(sum, loadedA, xDataVec); barrier(CLK_LOCAL_MEM_FENCE); } __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH]; //__local %TYPE* sData = sDataTemp; sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum; barrier(CLK_LOCAL_MEM_FENCE); //int TARGET_ROWS = %TARGET_ROWS; // Last Block // Do Scalar reduction for last block // Followed by solving the triangle if ( bIdx == ( nBlocks - 1)) { %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sumVec = %VMAKEVEC(sumTemp); %TYPE%V loadedAVec = %VMAKEVEC(sumTemp); //int targetRow = rowStart + threadIdx; int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps // Do vector reduction if ( threadIdx < TARGET_ROWS_BY_VEC ) { //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } } __local %TYPE xShared[%V]; int targetRowTemp = rowStart + threadIdx * %V; int VECTOR_SIZE = %V; //#pragma unroll for( int i=0; i < (TARGET_ROWS_BY_VEC); i++) { if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i)) { // Read X-vector %TYPE xVal[%V]; //#pragma unroll for( int j = 0; j < %V; j++) { xVal[j] = xnew[ (targetRowTemp + j)* incx]; } // Read A %Vx%V region into reg %TYPE reg[%V][%V]; //#pragma unroll for( int idx = 0; idx < ( %V * %V); idx++) { int m = idx / ( %V ); // Row : Col-Major idx... 
int n = idx % ( %V ); // Col if ( n > m ) { reg[m][n] = A( (targetRowTemp + m), (targetCol -( %V - 1 - n))); %CONJUGATE(doConj, reg[m][n]); } } %TYPE sumVecReg[%V]; %VSTOREWITHINCX(sumVecReg, sumVec, 1); // Solve for first x - Do the rest in loop %TYPE x[%V]; %SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]); xShared[%V - 1] = x[%V - 1]; xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1]; //#pragma unroll for(int m= ( %V - 2); m >=0; m--) { %SUB(x[m], xVal[m], sumVecReg[m]); } //#pragma unroll for( int idx = (( ( %V * %V) - 1) - %V); idx > 0; idx--) { int m = idx / %V; // Row : Row-Major idx, x[3] is solved before x[2] int n = idx % ( %V );// Col if ( n > m) { //x[m] = x[m] - reg[m][n] * x[n]; %MAD(x[m], reg[m][n], (-x[n])); } } // Store results //#pragma unroll for(int m = 0; m < %V; m++) { xShared[m] = x[m]; xnew[ (targetRowTemp + m)* incx ] = x[m]; } } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i)) { //#pragma unroll for( int j=0; j < %V; j++) { //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j]; %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j))); %CONJUGATE(doConj, loadedAVec); %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]); } } targetCol = targetCol - %V; // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); } } else { // Do Vector Reduction on each block except the last Block if ( threadIdx < TARGET_ROWS_BY_VEC) { %TYPE accTemp = %MAKEVEC(0.0); %TYPE%V acc = %VMAKEVEC(accTemp); //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } // Store the result int targetRow = rowStart + threadIdx * %V; __global %TYPE* xNewPtr = xnew + targetRow * incx; //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]); %TYPE%V value; %VLOADWITHINCX(value, xNewPtr, incx); // Compute result %SUB(value, value, acc); // Store results //VSTOREWITHINCX( xNewPtr, value, incx); %VSTOREWITHINCX(xNewPtr, value, incx); } } } } "; const char *trsv_CU_ComputeRectangle_NonUnity_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V)) #ifdef PACKED #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) #else #define A( row, col) A[ (row) + (col) * lda] #endif // Compute Rectangle + Traingle __kernel void %PREFIXtrsv_CU_ComputeRectangle_NonUnity_kernel( __global %TYPE const * restrict _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); // Get total blocks launched size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1; %TYPE sum = %MAKEVEC( 0.0); %TYPE loadedA = %MAKEVEC( 0.0); // First Block does scalar stuff... // Only this gets executed if nBlocks == 1 if ( bIdx == 0) { int targetCol = startCol; int targetRow = threadIdx; int lastRow = rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1; if ( nBlocks > 1) { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. 
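// (Every work-item re-reads the same xnew[targetCol] from global memory on each
//  iteration; staging it once in local memory would remove the redundant loads.)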
%TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol--; } %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum); } } else // Solve the traingle -- no more kernel launches required { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol--; } } // Change targetCol to point to Triangle last column for all threads // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true targetCol = startCol - %TARGET_ROWS; __local %TYPE xShared; // To share solved x value with other threads.. for( int i=0; i < (lastRow + 1); i++) { if ( targetRow == targetCol) { %TYPE xVal = xnew[ targetRow * incx]; sum = xVal - sum; // Handle diagonal element loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %DIV(xShared, sum, loadedA); xnew[ targetRow * incx ] = xShared; } barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow < targetCol) { loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race barrier(CLK_LOCAL_MEM_FENCE); targetCol--; } } } else { size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V); size_t colShift = threadIdx / TARGET_ROWS_BY_VEC; int rowStart = rowsLeft - ( %TARGET_ROWS * (nBlocks - bIdx) ); int row = rowStart + rowShift; %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sum = %VMAKEVEC(sumTemp); __local %TYPE xData[ %TARGET_WIDTH]; //#pragma unroll for( int i=1; i <= %NLOOPS; i++) { // Put startCol to start of BLOCKSIZE Block int startColp = startCol - (%TARGET_WIDTH * i) + 1; if ( threadIdx < %TARGET_WIDTH) { xData[threadIdx] = xnew[ (startColp + threadIdx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); int col = startColp + colShift; %TYPE xDataVal = xData[ colShift ]; %TYPE%V xDataVec= %VMAKEVEC( xDataVal); %TYPE%V loadedA = %VLOAD( 0, &A((row), (col))); %CONJUGATE(doConj, loadedA); %VMAD(sum, loadedA, xDataVec); barrier(CLK_LOCAL_MEM_FENCE); } __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH]; //__local %TYPE* sData = sDataTemp; sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum; barrier(CLK_LOCAL_MEM_FENCE); //int TARGET_ROWS = %TARGET_ROWS; // Last Block // Do Scalar reduction for last block // Followed by solving the triangle if ( bIdx == ( nBlocks - 1)) { %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sumVec = %VMAKEVEC(sumTemp); %TYPE%V loadedAVec = %VMAKEVEC(sumTemp); //int targetRow = rowStart + threadIdx; int targetCol = startCol- %TARGET_ROWS; // Col where triangle last col overlaps // Do vector reduction if ( threadIdx < TARGET_ROWS_BY_VEC ) { //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } } __local %TYPE xShared[%V]; int targetRowTemp = rowStart + threadIdx * %V; int VECTOR_SIZE = %V; //#pragma unroll for( int i=0; i < (TARGET_ROWS_BY_VEC); i++) { if ( threadIdx == (TARGET_ROWS_BY_VEC - 1 - i)) { // Read X-vector %TYPE xVal[%V]; //#pragma unroll for( int j = 0; j < %V; j++) { xVal[j] = xnew[ (targetRowTemp + j)* incx]; } // Read A %Vx%V region into reg %TYPE reg[%V][%V]; //#pragma unroll for( int idx = 0; idx < ( %V * %V); idx++) { int m = idx % ( %V ); // Row : Col-Major idx... 
int n = idx / ( %V ); // Col if ( n >= m ) { reg[m][n] = A((targetRowTemp + m), (targetCol -( %V - 1 - n))); %CONJUGATE(doConj, reg[m][n]); } } %TYPE sumVecReg[%V]; %VSTOREWITHINCX(sumVecReg, sumVec, 1); // Solve for first x - Do the rest in loop %TYPE x[%V]; %SUB(x[VECTOR_SIZE - 1], xVal[VECTOR_SIZE - 1], sumVecReg[VECTOR_SIZE - 1]); %DIV(sumVecReg[VECTOR_SIZE - 1], x[VECTOR_SIZE -1], reg[VECTOR_SIZE - 1][VECTOR_SIZE - 1]); x[VECTOR_SIZE -1] = sumVecReg[VECTOR_SIZE - 1]; xShared[%V - 1] = x[%V - 1]; xnew[ (targetRowTemp + %V - 1)* incx ] = x[%V - 1]; //#pragma unroll for(int m = ( %V - 2); m >=0; m--) { %SUB(x[m], xVal[m], sumVecReg[m]); } //#pragma unroll for( int idx = (( ( %V * %V) - 1) - %V); idx >= 0; idx--) { int m = idx / %V; // Row : Row-Major idx, x[3] is solved before x[2] int n = idx % ( %V );// Col if ( n > m) { //x[m] = x[m] - reg[m][n] * x[n]; %MAD(x[m], reg[m][n], (-x[n])); } else if ( m == n) { %DIV(sumVecReg[m], x[m], reg[m][m]); x[m] = sumVecReg[m]; } } // Store results //#pragma unroll for(int m = 0; m < %V; m++) { xShared[m] = x[m]; xnew[ (targetRowTemp + m)* incx ] = x[m]; } } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( threadIdx < (TARGET_ROWS_BY_VEC - 1 - i)) { //#pragma unroll for( int j=0; j < %V; j++) { //sumVec += vload4( 0, &A((targetRowTemp), (targetCol -j))) * xShared[%V - 1 -j]; %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol -j))); %CONJUGATE(doConj, loadedAVec); %VMAD(sumVec, loadedAVec, xShared[VECTOR_SIZE - 1 -j]); } } targetCol = targetCol - %V; // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); } } else { // Do Vector Reduction on each block except the last Block if ( threadIdx < TARGET_ROWS_BY_VEC) { %TYPE accTemp = %MAKEVEC(0.0); %TYPE%V acc = %VMAKEVEC(accTemp); //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } // Store the result int targetRow = rowStart + threadIdx * %V; __global %TYPE* xNewPtr = xnew + targetRow * incx; //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]); %TYPE%V value; %VLOADWITHINCX(value, xNewPtr, incx); // Compute result %SUB(value, value, acc); // Store results //VSTOREWITHINCX( xNewPtr, value, incx); %VSTOREWITHINCX(xNewPtr, value, incx); } } } } "; const char *trsv_CL_ComputeRectangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V)) #ifdef PACKED #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) #else #define A(row, col) A[ (row) + (col) * lda] #endif // Compute Rectangle + Traingle __kernel void %PREFIXtrsv_CL_ComputeRectangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); // Get total blocks launched size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1; %TYPE sum = %MAKEVEC( 0.0); %TYPE loadedA = %MAKEVEC( 0.0); // Last Block does scalar stuff... 
// Only this gets executed if nBlocks == 1 if ( bIdx == (nBlocks - 1)) { int targetCol = startCol; int startRow = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS; int targetRow = startRow + threadIdx; int lastRow = startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1; if ( nBlocks > 1) { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol++; } %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum); } } else // Solve the traingle -- no more kernel launches required { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol++; } } // Change targetCol to point to Triangle last column for all threads // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true targetCol = startCol + %TARGET_ROWS; __local %TYPE xShared; // To share solved x value with other threads.. for( int i=0; i < ((lastRow -startRow) + 1); i++) { if ( targetRow == targetCol) { %TYPE xVal = xnew[ targetRow * incx]; sum = xVal - sum; if( isUnity) { xShared = sum; } else // Handle diagonal element { loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %DIV(xShared, sum, loadedA); } xnew[ targetRow * incx ] = xShared; } barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow <= lastRow) { loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race barrier(CLK_LOCAL_MEM_FENCE); targetCol++; } } } else { size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V); size_t colShift = threadIdx / TARGET_ROWS_BY_VEC; int rowStart = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS; int row = rowStart + rowShift; %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sum = %VMAKEVEC(sumTemp); __local %TYPE xData[ %TARGET_WIDTH]; //#pragma unroll for( int i=1; i <= %NLOOPS; i++) { // Put startCol to start of BLOCKSIZE Block int startColp = startCol + (%TARGET_WIDTH * (i - 1)); if ( threadIdx < %TARGET_WIDTH) { xData[threadIdx] = xnew[ (startColp + threadIdx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); int col = startColp + colShift; %TYPE xDataVal = xData[ colShift ]; %TYPE%V xDataVec= %VMAKEVEC( xDataVal); %TYPE%V loadedA = %VLOAD( 0, &A((row), (col))); %CONJUGATE(doConj, loadedA); %VMAD(sum, loadedA, xDataVec); barrier(CLK_LOCAL_MEM_FENCE); } __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH]; //__local %TYPE* sData = sDataTemp; sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum; barrier(CLK_LOCAL_MEM_FENCE); //int TARGET_ROWS = %TARGET_ROWS; // Last Block // Do Scalar reduction for last block // Followed by solving the triangle if ( bIdx == 0 ) { %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sumVec = %VMAKEVEC(sumTemp); %TYPE%V loadedAVec = %VMAKEVEC(sumTemp); //int targetRow = rowStart + threadIdx; int targetCol = startCol + %TARGET_ROWS; // Col where triangle last col overlaps // Do vector reduction if ( threadIdx < TARGET_ROWS_BY_VEC ) { //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } } __local %TYPE xShared[%V]; int targetRowTemp = rowStart + threadIdx * %V; int VECTOR_SIZE = %V; //#pragma unroll for( int 
i=0; i < (TARGET_ROWS_BY_VEC); i++) { if ( threadIdx == i ) { // Read X-vector %TYPE xVal[%V]; //#pragma unroll for( int j = 0; j < %V; j++) { xVal[j] = xnew[ (targetRowTemp + j)* incx]; } // Read A %Vx%V region into reg %TYPE reg[%V][%V]; //#pragma unroll for( int idx = 0; idx < ( %V * %V); idx++) { int m = idx % ( %V ); // Row : Col-Major idx... int n = idx / ( %V ); // Col if ( m > n ) { reg[m][n] = A((targetRowTemp + m), (targetCol + n)); %CONJUGATE(doConj, reg[m][n]); } } %TYPE sumVecReg[%V]; %VSTOREWITHINCX(sumVecReg, sumVec, 1); // Solve for first x - Do the rest in loop %TYPE x[%V]; %SUB(x[0], xVal[0], sumVecReg[0]); xShared[0] = x[0]; xnew[ (targetRowTemp)* incx ] = x[0]; //#pragma unroll for(int m = 1; m < %V; m++) { %SUB(x[m], xVal[m], sumVecReg[m]); } //#pragma unroll for( int idx = %V; idx < (( %V * %V) - 1); idx++) { int m = idx / %V; // Row : Row-Major idx, x[1] is solved before x[2] int n = idx % ( %V );// Col if ( m > n) { //x[m] = x[m] - reg[m][n] * x[n]; %MAD(x[m], reg[m][n], (-x[n])); } } // Store results //#pragma unroll for(int m = 0; m < %V; m++) { xShared[m] = x[m]; xnew[ (targetRowTemp + m)* incx ] = x[m]; } } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) ) { //#pragma unroll for( int j=0; j < %V; j++) { %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j))); %CONJUGATE(doConj, loadedAVec); %VMAD(sumVec, loadedAVec, xShared[j]); } } targetCol = targetCol + %V; // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); } } else { // Do Vector Reduction on each block except the last Block if ( threadIdx < TARGET_ROWS_BY_VEC) { %TYPE accTemp = %MAKEVEC(0.0); %TYPE%V acc = %VMAKEVEC(accTemp); //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } // Store the result int targetRow = rowStart + threadIdx * %V; __global %TYPE* xNewPtr = xnew + targetRow * incx; //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]); %TYPE%V value; %VLOADWITHINCX(value, xNewPtr, incx); // Compute result %SUB(value, value, acc); // Store results %VSTOREWITHINCX(xNewPtr, value, incx); } } } } "; const char *trsv_CL_ComputeRectangle_NonUnity_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #define TARGET_ROWS_BY_VEC ((%TARGET_ROWS)/(%V)) #ifdef PACKED #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) #else #define A(row, col) A[ (row) + (col) * lda] #endif // Compute Rectangle + Traingle __kernel void %PREFIXtrsv_CL_ComputeRectangle_NonUnity_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startCol, int rowsLeft, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } size_t bIdx = get_group_id(0); size_t threadIdx= get_local_id(0); // Get total blocks launched size_t nBlocks = (rowsLeft - 1) / %TARGET_ROWS + 1; %TYPE sum = %MAKEVEC( 0.0); %TYPE loadedA = %MAKEVEC( 0.0); // Last Block does scalar stuff... 
// Only this gets executed if nBlocks == 1 if ( bIdx == (nBlocks - 1)) { int targetCol = startCol; int startRow = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS; int targetRow = startRow + threadIdx; int lastRow = startRow + rowsLeft - ( nBlocks - 1) * %TARGET_ROWS - 1; if ( nBlocks > 1) { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol++; } %SUB(xnew[ targetRow * incx], xnew[targetRow * incx], sum); } } else // Solve the traingle -- no more kernel launches required { if ( targetRow <= lastRow) { for( int i=0; i < %TARGET_ROWS; i++) { // All threads look at same xnew // Should use Shared Memory .. %TYPE xVal = xnew[ targetCol * incx]; loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xVal); targetCol++; } } // Change targetCol to point to Triangle last column for all threads // As the above condition ( targetRow <= lastRow) changes targetCol for only threads with condition true targetCol = startCol + %TARGET_ROWS; __local %TYPE xShared; // To share solved x value with other threads.. for( int i=0; i < ((lastRow -startRow) + 1); i++) { if ( targetRow == targetCol) { %TYPE xVal = xnew[ targetRow * incx]; sum = xVal - sum; // Handle diagonal element loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %DIV(xShared, sum, loadedA); xnew[ targetRow * incx ] = xShared; } barrier(CLK_LOCAL_MEM_FENCE); if ( targetRow <= lastRow) { loadedA = A((targetRow), (targetCol)); %CONJUGATE(doConj, loadedA); %MAD(sum, loadedA, xShared); } // Avoid Race barrier(CLK_LOCAL_MEM_FENCE); targetCol++; } } } else { size_t rowShift = ((threadIdx % ( TARGET_ROWS_BY_VEC )) * %V); size_t colShift = threadIdx / TARGET_ROWS_BY_VEC; int rowStart = (N - rowsLeft) + ( bIdx) * %TARGET_ROWS; int row = rowStart + rowShift; %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sum = %VMAKEVEC(sumTemp); __local %TYPE xData[ %TARGET_WIDTH]; //#pragma unroll for( int i=1; i <= %NLOOPS; i++) { // Put startCol to start of BLOCKSIZE Block int startColp = startCol + (%TARGET_WIDTH * (i - 1)); if ( threadIdx < %TARGET_WIDTH) { xData[threadIdx] = xnew[ (startColp + threadIdx) * incx]; } barrier(CLK_LOCAL_MEM_FENCE); int col = startColp + colShift; %TYPE xDataVal = xData[ colShift ]; %TYPE%V xDataVec= %VMAKEVEC( xDataVal); %TYPE%V loadedA = %VLOAD( 0, &A((row), (col))); %CONJUGATE(doConj, loadedA); %VMAD(sum, loadedA, xDataVec); barrier(CLK_LOCAL_MEM_FENCE); } __local %TYPE%V sDataTemp[TARGET_ROWS_BY_VEC * %TARGET_WIDTH]; //__local %TYPE* sData = sDataTemp; sDataTemp[(threadIdx % ( TARGET_ROWS_BY_VEC )) + (colShift * TARGET_ROWS_BY_VEC)] = sum; barrier(CLK_LOCAL_MEM_FENCE); //int TARGET_ROWS = %TARGET_ROWS; // Last Block // Do Scalar reduction for last block // Followed by solving the triangle if ( bIdx == 0 ) { %TYPE sumTemp = %MAKEVEC(0.0); %TYPE%V sumVec = %VMAKEVEC(sumTemp); %TYPE%V loadedAVec = %VMAKEVEC(sumTemp); //int targetRow = rowStart + threadIdx; int targetCol = startCol + %TARGET_ROWS; // Col where triangle last col overlaps // Do vector reduction if ( threadIdx < TARGET_ROWS_BY_VEC ) { //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(sumVec, sumVec, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } } __local %TYPE xShared[%V]; int targetRowTemp = rowStart + threadIdx * %V; int VECTOR_SIZE = %V; //#pragma unroll for( int i=0; i < (TARGET_ROWS_BY_VEC); i++) { if 
( threadIdx == i ) { // Read X-vector %TYPE xVal[%V]; //#pragma unroll for( int j = 0; j < %V; j++) { xVal[j] = xnew[ (targetRowTemp + j)* incx]; } // Read A %Vx%V region into reg %TYPE reg[%V][%V]; //#pragma unroll for( int idx = 0; idx < ( %V * %V); idx++) { int m = idx % ( %V ); // Row : Col-Major idx... int n = idx / ( %V ); // Col if ( m >= n ) { reg[m][n] = A((targetRowTemp + m), (targetCol + n)); %CONJUGATE(doConj, reg[m][n]); } } %TYPE sumVecReg[%V]; %VSTOREWITHINCX(sumVecReg, sumVec, 1); // Solve for first x - Do the rest in loop %TYPE x[%V]; %SUB(x[0], xVal[0], sumVecReg[0]); %DIV(sumVecReg[0], x[0], reg[0][0]); x[0] = sumVecReg[0]; xShared[0] = sumVecReg[0]; xnew[ (targetRowTemp)* incx ] = sumVecReg[0]; //#pragma unroll for(int m = 1; m < %V; m++) { %SUB(x[m], xVal[m], sumVecReg[m]); } //#pragma unroll for( int idx = %V; idx < (%V * %V); idx++) { int m = idx / %V; // Row : Row-Major idx, x[1] is solved before x[2] int n = idx % ( %V );// Col if ( m > n) { //x[m] = x[m] - reg[m][n] * x[n]; %MAD(x[m], reg[m][n], (-x[n])); } else if ( m == n) { %DIV(sumVecReg[m], x[m], reg[m][m]); x[m] = sumVecReg[m]; } } // Store results //#pragma unroll for(int m = 1; m < %V; m++) { xShared[m] = x[m]; xnew[ (targetRowTemp + m)* incx ] = x[m]; } } // Sync so that xShared it available to all threads barrier(CLK_LOCAL_MEM_FENCE); if ( (threadIdx > i) && ( threadIdx < (TARGET_ROWS_BY_VEC)) ) { //#pragma unroll for( int j=0; j < %V; j++) { %TYPE%V loadedAVec = %VLOAD( 0, &A((targetRowTemp), (targetCol +j))); %CONJUGATE(doConj, loadedAVec); %VMAD(sumVec, loadedAVec, xShared[j]); } } targetCol = targetCol + %V; // Avoid Race... barrier(CLK_LOCAL_MEM_FENCE); } } else { // Do Vector Reduction on each block except the last Block if ( threadIdx < TARGET_ROWS_BY_VEC) { %TYPE accTemp = %MAKEVEC(0.0); %TYPE%V acc = %VMAKEVEC(accTemp); //#pragma unroll for( int j=0; j < %TARGET_WIDTH; j++) { %ADD(acc, acc, sDataTemp[ threadIdx + j * TARGET_ROWS_BY_VEC]); } // Store the result int targetRow = rowStart + threadIdx * %V; __global %TYPE* xNewPtr = xnew + targetRow * incx; //float4 value = (float4)( xNewPtr[0], xNewPtr[incx], xNewPtr[incx * 2], xNewPtr[incx *3]); %TYPE%V value; %VLOADWITHINCX(value, xNewPtr, incx); // Compute result %SUB(value, value, acc); // Store results %VSTOREWITHINCX(xNewPtr, value, incx); } } } } "; const char *trsv_CUT_ComputeRectangle_kernel = " #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A( row, col) (*( A + (((col)*((col)+1))/2 + (row)))) #else #define A( row, col) A[ (row) + (col) * lda] #endif __kernel void %PREFIXtrsv_CUT_ComputeRectangle_kernel(__global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startRow, int endRow, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } int threadID = get_local_id(0); int threadID_Y, threadID_X; int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y; int blkid = get_group_id(0); int V= %V; __local %TYPE solved[%TRIANGLE_HEIGHT]; __local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT]; __local %TYPE%V *solved_vec; int blockStartRow; int triangleHeight; %TYPE%V acc; %TYPE%V loadedAVec; %TYPE sacc; %TYPE accTemp; triangleHeight = endRow - startRow; /* if ((triangleHeight != %TRIANGLE_HEIGHT) || ((triangleHeight % V) != 0)) { 
// throw -1; // // It is the caller's responsibility to solve triangle whose width // is a multiple of VECTOR SIZE before calling this routine. // This makes the width of the rectangle to be multiple of VECTOR SIZE. // Thus threads can iterate without looking out for vector-unfriendly // dimensions. // This condition can be maintained for any dimension of the input matrix // So, generality is not broken here. // *(__global int*)0 = 0; } if (( %BLOCKSIZE % %TARGET_HEIGHT) != 0) { // throw -1; // // Awkward Block Size. Impossible to write neat code. // The set of threads belonging to the last threadID_X will not have // blockSize_Y number of threads. // *(__global int*)0 = 0; } */ blockSize_y = %TARGET_HEIGHT; blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT; threadID_Y = threadID % %TARGET_HEIGHT; threadID_X = threadID / %TARGET_HEIGHT; blockStartRow = endRow + (blkid * blockSize_x); blockStartRow += threadID_X; for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize) { solved[i] = xnew[(startRow + i)*incx]; } barrier(CLK_LOCAL_MEM_FENCE); solved_vec = solved; accTemp = %INIT(0.0); acc = %VMAKEVEC( accTemp); if (blockStartRow < N) { for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y) { loadedAVec = %VLOAD(0, &A((startRow + i*V), (blockStartRow))); %CONJUGATE(doConj, loadedAVec); %VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well. } sacc = %REDUCE_SUM(acc); // Put stuff in shared memory for final reduction reduce[threadID_Y][threadID_X] = sacc; } barrier(CLK_LOCAL_MEM_FENCE); if ( threadID < blockSize_x) { sacc = %INIT(0.0); //#pragma unroll for( int i=0; i < %TARGET_HEIGHT; i++) { %ADD(sacc, sacc, reduce[i][threadID]); } blockStartRow = endRow + (blkid * blockSize_x); blockStartRow += threadID; if ( blockStartRow < N) { %SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc); } } } "; const char *trsv_CLT_ComputeRectangle_kernel=" #ifdef DOUBLE_PRECISION #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable #else #pragma OPENCL EXTENSION cl_amd_fp64 : enable #endif #endif #ifdef PACKED #define A(row, col) (*( A + ((( (col) *((2*N) + 1 - (col))) / 2) + ((row) - (col))))) #else #define A(row, col) A[ (row) + (col) * lda] #endif __kernel void %PREFIXtrsv_CLT_ComputeRectangle_kernel( __global const %TYPE* _A, __global %TYPE* _xnew, uint N, int incx, int isUnity, uint lda, int doConj, int startRow, int endRow, uint offa, uint offx) { __global %TYPE* xnew; __global %TYPE* A = _A + offa; if ( incx < 0 ) // Goto end of vector { xnew = _xnew + offx + ( N - 1) * abs(incx); } else { xnew = _xnew + offx; } int threadID = get_local_id(0); int threadID_Y, threadID_X; int blockSize = %BLOCKSIZE, blockSize_x, blockSize_y; int blkid = get_group_id(0); int V= %V; __local %TYPE solved[%TRIANGLE_HEIGHT]; __local %TYPE reduce[%TARGET_HEIGHT][ %BLOCKSIZE / %TARGET_HEIGHT]; __local %TYPE%V *solved_vec; int blockStartRow; int triangleHeight; %TYPE%V acc; %TYPE%V loadedAVec; %TYPE sacc; %TYPE accTemp; triangleHeight = endRow - startRow; blockSize_y = %TARGET_HEIGHT; blockSize_x = %BLOCKSIZE / %TARGET_HEIGHT; threadID_Y = threadID % %TARGET_HEIGHT; threadID_X = threadID / %TARGET_HEIGHT; blockStartRow = startRow - 1 - (blkid * blockSize_x); blockStartRow -= threadID_X; for(int i=threadID; i< %TRIANGLE_HEIGHT; i+=blockSize) { solved[i] = xnew[(startRow + i)*incx]; } barrier(CLK_LOCAL_MEM_FENCE); solved_vec = solved; accTemp = %INIT(0.0); acc = %VMAKEVEC( accTemp); if (blockStartRow >= 0) { for(int i=threadID_Y; i<(triangleHeight/V); i+=blockSize_y) { loadedAVec = %VLOAD(0, 
&A((startRow+ i*V) , (blockStartRow))); %CONJUGATE(doConj, loadedAVec); %VMAD(acc, solved_vec[i], loadedAVec); //startRow == startCol as well. } sacc = %REDUCE_SUM(acc); // Put stuff in shared memory for final reduction reduce[threadID_Y][threadID_X] = sacc; } barrier(CLK_LOCAL_MEM_FENCE); if ( threadID < blockSize_x) { sacc = %INIT(0.0); //#pragma unroll for( int i=0; i < %TARGET_HEIGHT; i++) { %ADD(sacc, sacc, reduce[i][threadID]); } blockStartRow = startRow - 1 - (blkid * blockSize_x); blockStartRow -= threadID; if ( blockStartRow < N) { %SUB(xnew[(blockStartRow)*incx], xnew[(blockStartRow)*incx], sacc); } } } "; clblas-2.10/src/library/blas/gens/clTemplates/zgemm_gcn.cl000066400000000000000000000304411264277366700235750ustar00rootroot00000000000000/******************************************************************************* * Notes: * for column major, id(0) is row so C data is coalesced * for row major, id(0) is col ******************************************************************************/ static const char * zgemm_NT_64_32_8_16x16_2x4__ALPHABETA = " // convert preprocs to ints for comparison #define _S_ 1 #define _D_ 2 #define _C_ 3 #define _Z_ 4 /******************************************************************************* * Pre-Processor "Strings" ******************************************************************************/ #define COLUMN_MAJOR_STR ColMajor #define ROW_MAJOR_STR RowMajor /******************************************************************************* * Kernel PreProcessor Definitions ******************************************************************************/ #define WG_NUM_ROWS 16 #define WG_NUM_COLS 16 #define MICRO_TILE_NUM_ROWS 2 #define MICRO_TILE_NUM_COLS 4 #define NUM_UNROLL_ITER 8 #define ORDER ColMajor #define TRANSPOSE_A N #define TRANSPOSE_B T #define DATA_TYPE _Z_ #define MACRO_TILE_NUM_ROWS 32 #define MACRO_TILE_NUM_COLS 64 // each row lengthened by this ammount #define LOCAL_ROW_PAD 1 // each col lengthened by this ammount #define LOCAL_COL_PAD 1 /******************************************************************************* * Global Memory Indices * Note: (a==b)==(c==d) means if both are true or neither is true ******************************************************************************/ /* col-major non-transposed * row-major transposed */ #define GET_GLOBAL_INDEX_N(ROW,COL,STRIDE) ((COL)*(STRIDE)+(ROW)) /* col-major transposed * row-major non-transposed */ #define GET_GLOBAL_INDEX_T(ROW,COL,STRIDE) ((ROW)*(STRIDE)+(COL)) // global A #if (ORDER==COLUMN_MAJOR_STR) == (TRANSPOSE_A==N) #define GET_GLOBAL_INDEX_A(ROW,COL) GET_GLOBAL_INDEX_N((ROW),(COL),(lda)) #else #define GET_GLOBAL_INDEX_A(ROW,COL) GET_GLOBAL_INDEX_T((ROW),(COL),(lda)) #endif // global B #if (ORDER==COLUMN_MAJOR_STR) == (TRANSPOSE_B==N) #define GET_GLOBAL_INDEX_B(ROW,COL) GET_GLOBAL_INDEX_T((ROW),(COL),(ldb)) #else #define GET_GLOBAL_INDEX_B(ROW,COL) GET_GLOBAL_INDEX_N((ROW),(COL),(ldb)) #endif // global C #if (ORDER==COLUMN_MAJOR_STR) #define GET_GLOBAL_INDEX_C(ROW,COL) GET_GLOBAL_INDEX_N((ROW),(COL),(ldc)) #else #define GET_GLOBAL_INDEX_C(ROW,COL) GET_GLOBAL_INDEX_T((ROW),(COL),(ldc)) #endif /******************************************************************************* * Local Memory Indices ******************************************************************************/ // localA - rotated 90 degrees from B but use same accessor unless slow #define GET_LOCAL_INDEX_A(ROW,COL) (ROW + COL*(MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD) ) #define GET_LOCAL_STEP_A ( 
((MACRO_TILE_NUM_COLS)+(LOCAL_ROW_PAD)) \ * ((WG_NUM_ROWS)*(WG_NUM_COLS)/(MACRO_TILE_NUM_COLS)) // localB #define GET_LOCAL_INDEX_B(ROW,COL) ((COL) + (ROW)*((MACRO_TILE_NUM_COLS)+(LOCAL_ROW_PAD)) ) #define GET_LOCAL_STEP_B ( ((MACRO_TILE_NUM_COLS)+(LOCAL_ROW_PAD)) \ * ((WG_NUM_ROWS)*(WG_NUM_COLS)/(MACRO_TILE_NUM_COLS)) /******************************************************************************* * Data Types ******************************************************************************/ // single precision #if DATA_TYPE==_S_ #define DATA_TYPE_STR float #define DATA_TYPE_CHAR s #define TYPE_MAD(MUL0,MUL1,DST) DST = mad(MUL0,MUL1,DST); #define TYPE_MAD2( DST, ALPHA, REG, BETA ) DST = (ALPHA)*(REG) + (BETA)*(DST); // double precision #elif DATA_TYPE==_D_ #define DATA_TYPE_STR double #define DATA_TYPE_CHAR d #define TYPE_MAD(MUL0,MUL1,DST) DST = mad(MUL0,MUL1,DST); #define TYPE_MAD2( DST, ALPHA, REG, BETA ) DST = (ALPHA)*(REG) + (BETA)*(DST); // complex single precision #elif DATA_TYPE==_C_ #define DATA_TYPE_STR float2 #define DATA_TYPE_CHAR c #define TYPE_MAD(MUL0,MUL1,DST) \ DST.s0 = mad( MUL0.s0, MUL1.s0, DST.s0 ); \ DST.s0 = mad( -MUL0.s1, MUL1.s1, DST.s0 ); \ DST.s1 = mad( MUL0.s0, MUL1.s1, DST.s1 ); \ DST.s1 = mad( MUL0.s1, MUL1.s0, DST.s1 ); #define TYPE_MAD2( DST, ALPHA, REG, BETA ) \ /* (1) */ \ type_mad2_tmp = REG.s0; \ REG.s0 *= ALPHA.s0; \ REG.s0 = mad( -ALPHA.s1, REG.s1, REG.s0 ); \ REG.s1 *= ALPHA.s0; \ REG.s1 = mad( ALPHA.s1, type_mad2_tmp, REG.s1 ); \ /* (2) */ \ REG.s0 = mad( BETA.s0, DST.s0, REG.s0 ); \ REG.s0 = mad( -BETA.s1, DST.s1, REG.s0 ); \ REG.s1 = mad( BETA.s1, DST.s0, REG.s1 ); \ REG.s1 = mad( BETA.s0, DST.s1, REG.s1 ); \ /* (3) */ \ DST = REG; // complex double precision #else #define DATA_TYPE_STR double2 #define DATA_TYPE_CHAR z #define TYPE_MAD(MUL0,MUL1,DST) \ DST.s0 = mad( MUL0.s0, MUL1.s0, DST.s0 ); \ DST.s0 = mad( -MUL0.s1, MUL1.s1, DST.s0 ); \ DST.s1 = mad( MUL0.s0, MUL1.s1, DST.s1 ); \ DST.s1 = mad( MUL0.s1, MUL1.s0, DST.s1 ); #define TYPE_MAD2( DST, ALPHA, REG, BETA ) \ /* (1) */ \ type_mad2_tmp = REG.s0; \ REG.s0 *= ALPHA.s0; \ REG.s0 = mad( -ALPHA.s1, REG.s1, REG.s0 ); \ REG.s1 *= ALPHA.s0; \ REG.s1 = mad( ALPHA.s1, type_mad2_tmp, REG.s1 ); \ /* (2) */ \ REG.s0 = mad( BETA.s0, DST.s0, REG.s0 ); \ REG.s0 = mad( -BETA.s1, DST.s1, REG.s0 ); \ REG.s1 = mad( BETA.s1, DST.s0, REG.s1 ); \ REG.s1 = mad( BETA.s0, DST.s1, REG.s1 ); \ /* (3) */ \ DST = REG; #endif /******************************************************************************* * 2x4 micro tile ******************************************************************************/ #define MAD2x4 \ rA[0] = localA[offA + 0*WG_NUM_ROWS]; \ rA[1] = localA[offA + 1*WG_NUM_ROWS]; \ rB[0] = localB[offB + 0*WG_NUM_COLS]; \ rB[1] = localB[offB + 1*WG_NUM_COLS]; \ rB[2] = localB[offB + 2*WG_NUM_COLS]; \ rB[3] = localB[offB + 3*WG_NUM_COLS]; \ offA += (MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD); \ offB += (MACRO_TILE_NUM_COLS+LOCAL_ROW_PAD); \ TYPE_MAD(rA[0],rB[0],rC[0][0]); \ TYPE_MAD(rA[1],rB[0],rC[1][0]); \ TYPE_MAD(rA[0],rB[1],rC[0][1]); \ TYPE_MAD(rA[1],rB[1],rC[1][1]); \ TYPE_MAD(rA[0],rB[2],rC[0][2]); \ TYPE_MAD(rA[1],rB[2],rC[1][2]); \ TYPE_MAD(rA[0],rB[3],rC[0][3]); \ TYPE_MAD(rA[1],rB[3],rC[1][3]); \ mem_fence(CLK_LOCAL_MEM_FENCE); // concatenate kernel name // zgemm_NT_64_32_8_16x16_2x4__ALPHABETA #define CONCAT_NAME(DT,TA,TB,TILE_COLS,TILE_ROWS,NUI,WGR,WGC,MTR,MTC) \ DT ## gemm_ ## TA ## TB ## _ ## TILE_COLS ## _ ## TILE_ROWS ## _ ## NUI ## _ ## WGR ## x ## WGC ## _ ## MTR ## x ## MTC ## __ALPHABETA #define 
KERNEL_NAME(DT,TA,TB,TILE_COLS,TILE_ROWS,NUI,WGR,WGC,MTR,MTC) CONCAT_NAME(DT,TA,TB,TILE_COLS,TILE_ROWS,NUI,WGR,WGC,MTR,MTC) /******************************************************************************* * Kernel ******************************************************************************/ __attribute__((reqd_work_group_size(WG_NUM_COLS,WG_NUM_ROWS,1))) __kernel void KERNEL_NAME(DATA_TYPE_CHAR,TRANSPOSE_A,TRANSPOSE_B,MACRO_TILE_NUM_COLS,MACRO_TILE_NUM_ROWS,NUM_UNROLL_ITER,WG_NUM_ROWS,WG_NUM_COLS,MICRO_TILE_NUM_ROWS,MICRO_TILE_NUM_COLS) ( uint const M, uint const N, uint const K, DATA_TYPE_STR const alpha, DATA_TYPE_STR const beta, __global DATA_TYPE_STR const * restrict A, __global DATA_TYPE_STR const * restrict B, __global DATA_TYPE_STR * C, uint const lda, uint const ldb, uint const ldc, uint const offsetA, uint const offsetB, uint const offsetC ) { // apply offsets A += offsetA; B += offsetB; C += offsetC; // registers DATA_TYPE_STR rC[MICRO_TILE_NUM_ROWS][MICRO_TILE_NUM_COLS] = { {0} }; DATA_TYPE_STR rA[MICRO_TILE_NUM_ROWS]; DATA_TYPE_STR rB[MICRO_TILE_NUM_COLS]; // local memory __local DATA_TYPE_STR localA[NUM_UNROLL_ITER*(MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD)]; __local DATA_TYPE_STR localB[NUM_UNROLL_ITER*(MACRO_TILE_NUM_COLS+LOCAL_ROW_PAD)]; /* * for coalesced C writing * if column major, id(0) is row * if row major, id(0) is col */ uint groupRow = get_group_id(0); uint groupCol = get_group_id(1); uint localRow = get_local_id(0); uint localCol = get_local_id(1); uint localSerial = localRow + localCol*WG_NUM_ROWS; /***************************************************************************** * global indices being loaded ****************************************************************************/ // which gAij is this thread responsible for loading? #define globalARow (groupRow*MACRO_TILE_NUM_ROWS + localSerial%MACRO_TILE_NUM_ROWS) #define globalACol (localSerial/MACRO_TILE_NUM_ROWS) #define globalAIdx (GET_GLOBAL_INDEX_A( globalARow, globalACol ) ) A += globalAIdx; // which gBij is this thread responsible for loading? #define globalBRow (localSerial/MACRO_TILE_NUM_COLS) #define globalBCol (groupCol*MACRO_TILE_NUM_COLS + localSerial%MACRO_TILE_NUM_COLS) #define globalBIdx (GET_GLOBAL_INDEX_B( globalBRow, globalBCol ) ) B += globalBIdx; uint block_k = K / NUM_UNROLL_ITER; #pragma nounroll do { /*************************************************************************** * local indices being written **************************************************************************/ // which lAij is this thread responsible for writing? #define localARow (localSerial % MACRO_TILE_NUM_ROWS) #define localACol (localSerial / MACRO_TILE_NUM_ROWS) #define localAStride ( (MACRO_TILE_NUM_ROWS+LOCAL_COL_PAD) * (WG_NUM_ROWS*WG_NUM_COLS/MACRO_TILE_NUM_ROWS) ) #define globalAStride ( GET_GLOBAL_INDEX_A(0, (WG_NUM_ROWS*WG_NUM_COLS/MACRO_TILE_NUM_ROWS) ) ) #define localAIdx ( GET_LOCAL_INDEX_A(localARow, localACol) ) __local DATA_TYPE_STR *lA = localA + localAIdx; // which lBij is this thread responsible for writing? 
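// (consecutive values of localSerial cover one padded row of the macro tile,
//  so a new row of local B starts every MACRO_TILE_NUM_COLS work items; each
//  work item then advances by localBStride elements, i.e.
//  WG_NUM_ROWS*WG_NUM_COLS/MACRO_TILE_NUM_COLS padded rows, between its two
//  stores below)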
#define localBRow ( localSerial / MACRO_TILE_NUM_COLS ) #define localBCol ( localSerial % MACRO_TILE_NUM_COLS ) #define localBIdx ( GET_LOCAL_INDEX_B(localBRow, localBCol) ) #define localBStride ( (MACRO_TILE_NUM_COLS+LOCAL_ROW_PAD) * (WG_NUM_ROWS*WG_NUM_COLS/MACRO_TILE_NUM_COLS) ) #define globalBStride ( GET_GLOBAL_INDEX_B( (WG_NUM_ROWS*WG_NUM_COLS/MACRO_TILE_NUM_COLS), 0 ) ) __local DATA_TYPE_STR *lB = localB + localBIdx; barrier(CLK_LOCAL_MEM_FENCE); /*************************************************************************** * Load global -> local * num loads = num threads / total loads **************************************************************************/ // 2x4 uTile x 8unroll lA[ 0*localAStride ] = A[ 0*globalAStride ]; lB[ 0*localBStride ] = B[ 0*globalBStride ]; lB[ 1*localBStride ] = B[ 1*globalBStride ]; barrier(CLK_LOCAL_MEM_FENCE); uint offA = localRow; uint offB = localCol; /*************************************************************************** * do mads in registers **************************************************************************/ MAD2x4 MAD2x4 MAD2x4 MAD2x4 MAD2x4 MAD2x4 MAD2x4 MAD2x4 // fully shift A += lda*NUM_UNROLL_ITER; // b/c N B += ldb*NUM_UNROLL_ITER; // b/c T } while (--block_k > 0); // which global Cij is this thread responsible for computing? uint globalCRow = groupRow * MACRO_TILE_NUM_ROWS + localRow; uint globalCCol = groupCol * MACRO_TILE_NUM_COLS + localCol; /*************************************************************************** * write data **************************************************************************/ double type_mad2_tmp; // used in TYPE_MAD2 TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+0*WG_NUM_ROWS, globalCCol+0*WG_NUM_COLS) ], alpha, rC[0][0], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+0*WG_NUM_ROWS, globalCCol+1*WG_NUM_COLS) ], alpha, rC[0][1], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+0*WG_NUM_ROWS, globalCCol+2*WG_NUM_COLS) ], alpha, rC[0][2], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+0*WG_NUM_ROWS, globalCCol+3*WG_NUM_COLS) ], alpha, rC[0][3], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+1*WG_NUM_ROWS, globalCCol+0*WG_NUM_COLS) ], alpha, rC[1][0], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+1*WG_NUM_ROWS, globalCCol+1*WG_NUM_COLS) ], alpha, rC[1][1], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+1*WG_NUM_ROWS, globalCCol+2*WG_NUM_COLS) ], alpha, rC[1][2], beta ) TYPE_MAD2( C[ GET_GLOBAL_INDEX_C( globalCRow+1*WG_NUM_ROWS, globalCCol+3*WG_NUM_COLS) ], alpha, rC[1][3], beta ) } "; clblas-2.10/src/library/blas/gens/copy_reg.cpp000066400000000000000000000147471264277366700213600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * copy generator */ //#define DEBUG_COPY #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? (a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_COPY printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initCopyRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps copyOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_COPY printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldc.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initCopyRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_COPY printf("initREgPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = ©Ops; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(pgran, subdims); char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_COPY printf("COPY GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_COPY printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_COPY printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_COPY printf("Using Aligned Data Pointer .........................\n"); #endif } strcpy( tempTemplate, (char*)copy_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXcopy_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy; INIT_KARG(&args[0], blasArgs->A); INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); incy = blasArgs->ldc.vector; INIT_KARG(&args[6], incy); return; } clblas-2.10/src/library/blas/gens/decomposition.c000066400000000000000000000103001264277366700220420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 
Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * This module contains implementation of API for checking * decompositions and calculate granularity */ #include #include #include #include "blas_kgen.h" static __inline bool checkSizeStepRelation(size_t size, size_t step) { return ((size == SUBDIM_UNUSED) || (size && (size % step == 0))); } bool decompSanityCheck( const SubproblemDim *subdims, unsigned int minSize, unsigned int maxSize, unsigned int maxRegs, DataType dtype, bool wholeA) { bool ret; if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } if ( ((subdims[1].x < minSize) ||(subdims[1].x > maxSize)) || ((subdims[1].y < minSize) || (subdims[1].y > maxSize)) || ((subdims[1].bwidth < minSize) || (subdims[1].bwidth > maxSize)) ) { return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].itemX || subdims[0].y % subdims[1].itemY || subdims[0].bwidth % subdims[1].bwidth ){ return false; } ret = checkSizeStepRelation(subdims[0].itemX, subdims[0].x); ret = ret && checkSizeStepRelation(subdims[0].itemY, subdims[0].y); ret = ret && checkSizeStepRelation(subdims[1].itemX, subdims[1].x); ret = ret && checkSizeStepRelation(subdims[1].itemY, subdims[1].y); if (ret) { size_t regUse; size_t regsA; if (wholeA) { regsA = subdims[1].y * subdims[1].bwidth; } else { regsA = szmax(subdims[1].y, subdims[1].bwidth); } // estimate register usage, drop // inevitably slowed decompositions regUse = ( regsA + subdims[1].bwidth * subdims[1].x + subdims[1].x * subdims[1].y ) * dtypeSize(dtype); regUse /= 16; // 16 bytes per register ret = (regUse <= maxRegs); } return ret; } void calcPgranDedicated( PGranularity *pgran, const SubproblemDim *subdims, int xdim, int level) { unsigned int xg, yg; DUMMY_ARG_USAGE(level); assert((xdim >= -1) && (xdim <= 1)); xg = (unsigned int)(subdims[0].x / subdims[1].itemX); yg = (unsigned int)(subdims[0].y / subdims[1].itemY); if (xdim == -1) { pgran->wgSize[0] = xg * yg; pgran->wgSize[1] = 1; pgran->wgDim = 1; } else { pgran->wgSize[xdim] = xg; pgran->wgSize[1 - xdim] = yg; pgran->wgDim = 2; } } void calcPgranCooperative( PGranularity *pgran, const SubproblemDim *subdims, int xdim, int ydim, int level) { unsigned int xg, yg; DUMMY_ARG_USAGE(level); assert((xdim >= 0) && (xdim <= 2)); assert((ydim >= 0) && (ydim <= 2)); assert((xdim && ydim) && (!xdim && !ydim)); assert(!( ((xdim == 2) && (ydim == 0)) || ((ydim == 2) && (xdim == 0)) )); xg = (unsigned int)(subdims[0].x / subdims[1].itemX); yg = (unsigned int)(subdims[0].y / subdims[1].itemY); if (xdim == ydim) { pgran->wgSize[xdim] = xg * yg; } else { pgran->wgSize[xdim] = xg; pgran->wgSize[ydim] = yg; } if ((xdim > 0) || (ydim > 0)) { pgran->wgSize[0] = (unsigned int)(subdims[0].bwidth / subdims[1].bwidth); } pgran->wgDim = umax(xdim, ydim) + 1; } 
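For reference, the register-pressure heuristic used by decompSanityCheck() above can be read in isolation as the short sketch below. The helper name estimateRegUse and the sample tile sizes are illustrative only and are not part of the clBLAS sources; the arithmetic (A-tile registers plus the B and C tiles, scaled by the element size and divided by 16 bytes per register) mirrors the function above.

#include <stdio.h>

/* Stand-alone sketch of the estimate in decompSanityCheck(); not library code. */
static unsigned int estimateRegUse(unsigned int y, unsigned int x,
                                   unsigned int bwidth,
                                   unsigned int elemSize, int wholeA)
{
    /* registers needed for the A tile: the whole tile, or only its larger side */
    unsigned int regsA = wholeA ? y * bwidth : (y > bwidth ? y : bwidth);
    /* bytes kept in registers for the A, B and C tiles of one work item */
    unsigned int bytes = (regsA + bwidth * x + x * y) * elemSize;
    return bytes / 16;              /* 16 bytes per register */
}

int main(void)
{
    /* hypothetical 4x4 micro-tile with bwidth 4 in double precision (8 bytes) */
    printf("estimated registers: %u\n", estimateRegUse(4, 4, 4, 8, 1));
    return 0;
}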
clblas-2.10/src/library/blas/gens/dot.cpp000066400000000000000000000170351264277366700203300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * dot generator */ //#define DEBUG_DOT #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? (a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_DOT printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initDotRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps dotOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, fixupArgs, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_DOT printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldc.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initDotRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_DOT printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &dotOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARG_USAGE(subdims); size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_DOT printf("DOT GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_DOT printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_DOT printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_DOT printf("Using Aligned Data Pointer .........................\n"); #endif } strcpy( tempTemplate, (char*)dot_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXdot_kernel( __global %TYPE *_X, __global %TYPE *_Y, __global %TYPE *scratchBuff, uint N, uint offx, int incx, uint offy, int incy, int doConj ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy, doConj; INIT_KARG(&args[0], blasArgs->B); INIT_KARG(&args[1], blasArgs->C); INIT_KARG(&args[2], blasArgs->D); initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[5], incx); initSizeKarg(&args[6], blasArgs->offCY); incy = blasArgs->ldc.vector; INIT_KARG(&args[7], incy); doConj = blasArgs->K; INIT_KARG(&args[8], doConj); 
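/* Mapping follows the %PREFIXdot_kernel signature quoted above: B and C
 * supply the _X and _Y vectors, D the scratchBuff argument, ldb.vector and
 * ldc.vector carry incx and incy, and K is reused to pass the doConj flag. */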
return; } /** The purpose of this function is to add an work-group size indicator in kernelKey, so that a different kernel is generated when work-group size is changed. Reduction loop is unrolled in kprintf based on work-group size. Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel this will become a kernelKey, and kernel cache will be accordingly managed. Note -- SubproblemDim is a member of kernelKey **/ static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { DUMMY_ARG_USAGE(extra); CLBlasKargs *kargs = (CLBlasKargs*)args; SolutionStep *step = container_of(kargs, args, SolutionStep); subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]); } clblas-2.10/src/library/blas/gens/fetch.c000066400000000000000000001711311264277366700202710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * COMMON DESCRIPTION: * * This module implements generation of fetches from memory to registers. * It support various optimization strategies depending on used addressing * modes, size of tiles, etc. Such a strategy is provided by an object * that is named addressing agent. * * The module supports explicit statements repordering so as to group together * scattered ALU and FETCH statements. The reordering is implemented by means * of the statement batch. Scheme of priority assignment for statements put * to the batch within the same call: * - Statments declaring and initializing variables have the highest * priority because all the sebsequent ones depend on it. * - Fetch statements have the decreased priority if any preparative * statements have really been generated * - Statements for updating variables have more decreased priority * - If an updating variable statement has been generated before full * tile fetch completion, priority for the next fetch statement is * decreased so as to don't disturb statements dependency. */ #include #include #include #include #include #include #include "blas_kgen.h" #define MAX_LENGTH 4096 #define BITS_INT (sizeof(int) * 8) struct FetchContext; enum { MAX_AUXILIARY_VARNUM = 32, MAX_ADDR_AGENTS = 8, ADDR_AGENT_PRIVATE_SIZE = 64, /* * buffer size enough to fit a declaration of a vectorized coordinate, * expressions for all components, operators for building a correct syntax * construction, and blanks between 2 adjacent component initializers */ COORD_BUFSIZE = (MAX_OPENCL_VECTOR_LENGTH + 1) * (sizeof(Kstring) + 2) + 16, /* * Priority of all statement declaring and initializing some variables */ PREPARE_VARS_STMT_PRIORITY = 0, GENERIC_OPT_LEVELS = FOPTLEV_PREFETCH | FOPTLEV_CAN_SHARE_TMP_AB | FOPTLEV_MERGE_FETCHES }; /* * Agent for some addressing scheme. 
Incapsulates creation and updating * of auxiliary variables and building offset expressions */ typedef struct AddrAgent { Kstring vars[MAX_AUXILIARY_VARNUM]; // usage counters for using for A and B int usageCount[2]; // loop preparation counters for A and B int loopPrepCount[2]; char priv[ADDR_AGENT_PRIVATE_SIZE]; bool (*match)(const struct FetchContext*); /* * Generate code preparing needed variables. Must return 1 if some * variables has been actually prepared, 0 otherwise */ int (*prepareVars)(struct FetchContext*); /* * Generate code updating variables. Must return 1 if some variables * has been actually prepared, 0 otherwise. * 'stmtPriority' means the priority that must have a statement that * is the agent is going to add to the batch */ int (*updateVars)(struct FetchContext*, unsigned int nextLine, unsigned int nextVec, int stmtPriority); void (*sprintfAddrOffset)(Kstring*, struct FetchContext*, unsigned int line, unsigned int vec); } AddressingAgent; // Preperties of the current operation of offset evaluation. struct OffsetEvalProps { // global size K is in vectors bool gkInVect; // all coordinates are in vectors bool coordInVect; /* * don't multiply coordinate in the second physical dimension * on leading dimension, it is already done */ bool ldNotMul; /* * Vector length of linear component in leading dimension. * Number of linear coordinates in the leading dimension taken * by an addressing agent at a time at offset evaluation must be * equal to this number. */ unsigned int leadVecLen; }; typedef struct FetchContext { // addressing mode that should be used in fetch operations FetchAddrMode addrMode; // optimization levels of code generation FetchOptLevel optLevels; AddressingAgent agents[MAX_ADDR_AGENTS]; AddressingAgent *currAgent; AddressingAgent *prevAgent; const BlasGenSettings *gset; const FetchOpts *fopts; // statement batch used at the current generation struct StatementBatch *batch; // Respective physical tile in global memory Tile physTile; // physical dimension passed in the outer loop int outerDim; struct OffsetEvalProps oevp; bool isLoopPreparation; // markers of context validity for matrix A and B bool valid[2]; } FetchContext; struct PhysOffsetComponents { Kstring base; Kstring offset; Kstring bound; }; /* * Raw leading dimension. This a pair of a leading dimension * expressed in number of elements and value on with which it * should be scaled for correct addressing. 
* Scale set to '0' means that the value in elements matches the * value in vectors */ struct RawLD { Kstring str; unsigned int scale; }; static const char *vectComponents = "0123456789abcdef"; static void sprintfOffsetStateless(Kstring *expr, FetchContext *fctx, unsigned int line, unsigned int vec); static void initStatelessAgent(AddressingAgent *agent); static void initTmpCoordAgent(AddressingAgent *agent); static void initPersCoordAgent(AddressingAgent *agent); void (*initAgentsTable[])(AddressingAgent *agent) = { initStatelessAgent, initTmpCoordAgent, initPersCoordAgent, NULL }; static __inline bool isOne(const Kstring *kstr) { return (kstr->buf[0] == '1') && (kstr->buf[1] == '\0'); } static __inline bool isZero(const Kstring *kstr) { return (kstr->buf[0] == '0') && (kstr->buf[1] == '\0'); } static __inline bool isLocalMemoryUsed(const FetchOpts *fopts) { return ((fopts->mrole == MATRIX_A) && (fopts->memA == CLMEM_LOCAL_MEMORY)) || ((fopts->mrole == MATRIX_B) && (fopts->memB == CLMEM_LOCAL_MEMORY)); } static __inline unsigned int tileVecColsNum(const Tile *physTile) { return physTile->nrCols / physTile->vecLen; } static __inline bool canBeFetchesMerged(const FetchContext *fctx) { return (fctx->optLevels & FOPTLEV_MERGE_FETCHES) != 0; } /* * Returns if the linear offsets along the dimension K * can be shared for tiles A and B */ static bool canBeKoffShared(const FetchContext *fctx) { unsigned int vlenA, vlenB; bool canShare; vlenA = getVecLen(fctx->gset, CLBLAS_GEMM, MATRIX_A); vlenB = getVecLen(fctx->gset, CLBLAS_GEMM, MATRIX_B); canShare = !fctx->gset->tileA.trans && fctx->gset->tileBX.trans && (vlenA == vlenB); canShare = canShare && (fctx->currAgent == fctx->prevAgent) && ((fctx->optLevels & FOPTLEV_CAN_SHARE_TMP_AB) != 0); return canShare; } static __inline const Tile* getDstTile(const FetchContext *fctx) { return (fctx->fopts->mrole == MATRIX_A) ? &fctx->gset->tileA : &fctx->gset->tileBX; } static __inline bool isFetchContextValid(const FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; return fctx->valid[i]; } static __inline void invalidateFetchContext(FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; fctx->valid[i] = false; } static __inline int agentUsageCount(const FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; return fctx->currAgent->usageCount[i]; } static __inline void incAgentUsageCount(FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; fctx->currAgent->usageCount[i]++; } static __inline int agentLoopPrepCount(const FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; return fctx->currAgent->loopPrepCount[i]; } static __inline void incAgentLoopPrepCount(FetchContext *fctx) { int i = (fctx->fopts->mrole == MATRIX_A) ? 0 : 1; fctx->currAgent->loopPrepCount[i]++; } static int bwidthPhysDimension(const FetchContext *fctx) { int dim; const Tile *tile; tile = getDstTile(fctx); if (fctx->fopts->mrole == MATRIX_A) { dim = (tile->trans) ? 1 : 0; } else { dim = (tile->trans) ? 
0 : 1; } return dim; } static FetchAddrMode fetchAddrModeFromMulOpts(const TileMulOpts *mulOpts) { FetchAddrMode mode = FETCH_ADDR_NORMAL; TileMulFlags mflags = mulOpts->flags; if (mflags & (TILEMUL_SKEW_A | TILEMUL_GLOBAL_CYCLIC_A)) { mode |= FETCH_ADDR_A_CYCLICAL; } if (mflags & (TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_B)) { mode |= FETCH_ADDR_B_CYCLICAL; } if (mflags & (TILEMUL_SKEW_K | TILEMUL_GLOBAL_CYCLIC_K)) { mode |= FETCH_ADDR_K_CYCLICAL; } if (mflags & TILEMUL_WRAP_AROUND_TAIL) { mode |= FETCH_ADDR_TAILK_PADD; } return mode; } static void sprintfVectorComponent( Kstring *kstr, const char *baseName, unsigned int n, unsigned int maxn) { assert(n < maxn); if (maxn == 1) { kstrcpy(kstr, baseName); } else { ksprintf(kstr, "%s.s%c", baseName, vectComponents[n]); } } /* * sprintf base coordinate and scale it in accordance with * used mode and vector length so as it is in vectors */ static void sprintfNormalizedBaseCoord( Kstring *kstr, const char *name, int physDim, FetchContext *fctx) { int shift = findHighestSetBit(fctx->physTile.vecLen); if (physDim || fctx->oevp.coordInVect || (shift == 0)) { kstrcpy(kstr, name); } else { ksprintf(kstr, "(uint)(%s >> %d)", name, shift); } } static void sprintfOffsetVector(Kstring *kstr, unsigned int base, unsigned int len) { if (len == 1) { ksprintf(kstr, "%u", base); } else { unsigned int i; ksprintf(kstr, "(uint%u)(%u", len, base); for (i = 1; i < len; i++) { kstrcatf(kstr, ", %u", base + i); } kstrcatf(kstr, "%c", ')'); } } static void sprintfLinearOffset( Kstring *expr, const struct PhysOffsetComponents *comp, bool swapBaseOff) { int cnt = 0; const Kstring *kstr = NULL; bool isBounded; expr->buf[0] = '\0'; if (!isKstringEmpty(&comp->base) && !isZero(&comp->base)) { cnt++; kstr = &comp->base; } if (!isKstringEmpty(&comp->offset) && !isZero(&comp->offset)) { cnt++; kstr = &comp->offset; } if (cnt == 0) { return; } isBounded = !isKstringEmpty(&comp->bound); if (cnt == 2) { const Kstring *first = (swapBaseOff) ? &comp->offset : &comp->base; const Kstring *second = (swapBaseOff) ? &comp->base : &comp->offset; if (isBounded) { ksprintf(expr, "(%s + %s) %% %s", first->buf, second->buf, &comp->bound.buf); } else { ksprintf(expr, "%s + %s", first->buf, second->buf); } } else { if (isBounded) { ksprintf(expr, "%s %% %s", kstr->buf, &comp->bound.buf); } else { kstrcpy(expr, kstr->buf); } } } /* * Estimate if address offset evaluation will be cheap without any savings. * If kxy is 0, then predicate it for the coordinates along the dimension K, * otherwise do it for the coordinates along rows of A or columns of B. */ static bool estimateOffsetEvalCheap(const FetchContext *fctx, int kxy) { int kdim; unsigned int n; const Tile *physTile; FetchAddrMode relFlag, cycFlag; bool needNorm; /* * Criteria: * Evaluation is cheap if addressing is relative or number of * elements in this dimension doesn't exceed 2 and no transform * to vectors (normalization) or cycling is needed. */ kdim = bwidthPhysDimension(fctx); physTile = &fctx->physTile; needNorm = (physTile->vecLen > 1); if (!kxy) { n = (kdim) ? physTile->nrRows : tileVecColsNum(physTile); relFlag = FETCH_ADDR_K_RELATIVE; cycFlag = FETCH_ADDR_K_CYCLICAL; needNorm = needNorm && !kdim; } else { MatrixRole mrole = fctx->fopts->mrole; n = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows; relFlag = (mrole == MATRIX_A) ? FETCH_ADDR_A_RELATIVE : FETCH_ADDR_B_RELATIVE; cycFlag = (mrole == MATRIX_A) ? 
FETCH_ADDR_A_CYCLICAL : FETCH_ADDR_B_CYCLICAL; needNorm = needNorm && kdim; } return ( (fctx->addrMode & relFlag) || ((n <= 2) && !(needNorm || (fctx->addrMode & cycFlag))) ); } /* * Predicate if register consumption will be high if the * generator request a space for 'nrCoords' coordinates. * The 'isPers' argument shows if these are persistent * coordinates or not. * The 'isSummary' argument shows if this is summary number * of coordinates for both the tiles or only for one of * the tiles. */ static bool predictHighRegConsumption( const FetchContext *fctx, unsigned int nrCoords, bool isPers, bool isSummary) { unsigned int max; DUMMY_ARG_USAGE(fctx); // TODO: take into account number of registers consumed by the tiles max = (isPers) ? 12 : 16; if (isSummary) { max *= 2; } return !(nrCoords < max); } static void sprintfLeadingDimension(Kstring *ld, const FetchContext *fctx) { bool done = false; const char *varName; varName = (fctx->fopts->mrole == MATRIX_A) ? fctx->gset->varNames.lda : fctx->gset->varNames.ldb; if (!(fctx->gset->flags & BGF_LD_IN_VECTORS)) { int shift; shift = findHighestSetBit(fctx->physTile.vecLen); if (shift != 0) { ksprintf(ld, "(uint)(%s >> %d)", varName, shift); done = true; } } if (!done) { kstrcpy(ld, varName); } } /* * fill raw leading dimension */ static void fillRawLD( struct RawLD *ld, const FetchContext *fctx) { const char *varName; varName = (fctx->fopts->mrole == MATRIX_A) ? fctx->gset->varNames.lda : fctx->gset->varNames.ldb; kstrcpy(&ld->str, varName); ld->scale = (fctx->gset->flags & BGF_LD_IN_VECTORS) ? 0 : fctx->physTile.vecLen; } /* * Spintf bound for the K component in case of storing a matrix * in the global memory */ static void sprintfGboundK(Kstring *kstr, const FetchContext *fctx) { int dim; const char *varK = fctx->gset->varNames.sizeK; unsigned int vecLen; int shift; vecLen = fctx->physTile.vecLen; shift = findHighestSetBit(vecLen); dim = bwidthPhysDimension(fctx); if (dim || fctx->oevp.gkInVect || (shift == 0)) { kstrcpy(kstr, varK); } else { if (fctx->addrMode & FETCH_ADDR_TAILK_PADD) { ksprintf(kstr, "(uint)((%s + %u) >> %d)", varK, vecLen - 1, shift); } else { ksprintf(kstr, "(uint)(%s >> %d)", varK, shift); } } } static void selectAddrAgent(FetchContext *fctx) { unsigned int level; FetchOptLevel origLevels; FetchOptLevel prefLev, mergeLev; int i; bool last = false; prefLev = fctx->optLevels & FOPTLEV_PREFETCH; /* * The merge level doesn't affect addressing agents in any way. * So, clear it for a time so as they wouldn't even know if it * is used or not. */ mergeLev = fctx->optLevels & FOPTLEV_MERGE_FETCHES; origLevels = fctx->optLevels & ~FOPTLEV_MERGE_FETCHES; fctx->currAgent = NULL; /* * Selecting criteria: Any of the agents supporting an optimization level * as high as possible which is suitable for these generator settings. 
*/ for (level = 1 << (sizeof(int) * 8 - 1); !last && (fctx->currAgent == NULL); level >>= 1) { last = (level == 0); if (!(last || (origLevels & level))) { continue; } fctx->optLevels = (FetchOptLevel)level | prefLev; for (i = 0; i < MAX_ADDR_AGENTS; i++) { fctx->currAgent = &fctx->agents[i]; if (fctx->currAgent->match == NULL) { fctx->currAgent = NULL; break; } if (fctx->currAgent->match(fctx)) { break; } fctx->currAgent = NULL; } } fctx->optLevels = origLevels | mergeLev; assert(fctx->currAgent != NULL); } static unsigned int persVarDepthK(const FetchContext *fctx, unsigned int maxVarVecLen) { unsigned int depth = 0; unsigned int maxDepth; int kdim; unsigned int vlen = 0; const Tile *physTile = &fctx->physTile; kdim = bwidthPhysDimension(fctx); vlen = tileVectorsNum(physTile); vlen = umin(vlen, maxVarVecLen); if (kdim) { depth = vlen / tileVecColsNum(physTile); maxDepth = physTile->nrRows; } else { depth = vlen / physTile->nrRows; maxDepth = tileVecColsNum(physTile); } /* * If the dimension K is traversed in the inner loop, and * not all coordinates can be saved, then using persistent * coordinates is prohibited because there is no chance to * update the vectorized coordinate till the end of the whole * tile fetch. */ if ((fctx->outerDim != kdim) && (depth < maxDepth)) { depth = 0; } return depth; } static void genInitVectCoord( FetchContext *fctx, const Kstring *name, unsigned int lenXY, unsigned int depthK, bool decl, bool isConst) { const Tile *physTile; char buf[COORD_BUFSIZE]; char *p = NULL; unsigned int i, k, lenFull; int kdim; const char *declPref; bool needVect; Kstring aoff; unsigned int vlen; Kstring coordType; kdim = bwidthPhysDimension(fctx); physTile = &fctx->physTile; lenFull = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows; /* * If it makes sense Using vectorization at offset evaluation to * avoid extra casting of coordinate in vectors to coordinate in elements */ needVect = decl && ( (!kdim && (depthK > 1) && (lenXY == 1)) || (kdim && (depthK == 1) && (lenXY > 1)) ); vlen = lenXY * depthK; // coordinate declarator declPref = (isConst) ? "const " : ""; if (decl) { if (vlen == 1) { ksprintf(&coordType, "%suint", declPref); } else { ksprintf(&coordType, "%suint%u", declPref, vlen); } } // declaration + initialization if (needVect || (decl && (vlen == 1))) { if (needVect) { fctx->oevp.leadVecLen = vlen; } sprintfOffsetStateless(&aoff, fctx, 0, 0); kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "%s %s = %s;\n", coordType.buf, name->buf, aoff.buf); fctx->oevp.leadVecLen = 1; } else { unsigned int n = 0; if (decl) { p = buf + sprintf(buf, "%suint%u %s = {", declPref, vlen, name->buf); } for (k = 0; k < depthK; k++) { for (i = 0; i < lenXY; i++) { unsigned int line, vec; line = (kdim) ? k : i; vec = (kdim) ? i : k; sprintfOffsetStateless(&aoff, fctx, line, vec); if (decl) { const char *pref = (n % 3) ? 
", " : ""; p += sprintf(p, "%s%s", pref, aoff.buf); // split long lines n++; if (!(n % 3) && (n != vlen)) { p += sprintf(p, "%s", ",\n\t\t"); } } else { kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "%s.s%c = %s;\n", name->buf, vectComponents[k * lenFull + i], aoff.buf); } } } if (decl) { strcpy(p, "};\n"); assert(p + 4 < buf + COORD_BUFSIZE); kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, buf); } } } /**************** Implement different addressing agents *********************/ /********** Stateless (without precoputing) memory addressing agent *********/ static bool matchStateless(const FetchContext *fctx) { return !(fctx->optLevels & ~GENERIC_OPT_LEVELS); } static void sprintfOffsetStateless( Kstring *expr, FetchContext *fctx, unsigned int line, unsigned int vec) { FetchAddrMode addrMode = fctx->addrMode; bool isRel; // shows if addressing is relative const Tile *physTile; bool useLocal; int kdim; unsigned int i, u; struct PhysOffsetComponents comps; Kstring leadStr, secStr; struct RawLD leadDim; bool vectLead; bool swap; Kstring *kstr; const KernelVarNames *kvars = &fctx->gset->varNames; unsigned int vecLen; unsigned int offVlen; const char *p; FetchAddrMode amask; MatrixRole mrole = fctx->fopts->mrole; const SubproblemDim *subdim = fctx->gset->subdims; emptyKstring(&secStr); emptyKstring(&leadStr); offVlen = fctx->oevp.leadVecLen; vectLead = (offVlen > 1); physTile = &fctx->physTile; vecLen = physTile->vecLen; kdim = bwidthPhysDimension(fctx); useLocal = isLocalMemoryUsed(fctx->fopts); // fill components relating to X or Y memset(&comps, 0, sizeof(comps)); amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_RELATIVE : FETCH_ADDR_B_RELATIVE; isRel = ((addrMode & amask) != 0); // base if (!isRel) { p = (mrole == MATRIX_A) ? kvars->coordA : kvars->coordB; sprintfNormalizedBaseCoord(&comps.base, p, 1 - kdim, fctx); } // offset u = (kdim) ? vec : line; i = (kdim) ? offVlen : 1; if (u || i) { sprintfOffsetVector(&comps.offset, u, i); } // bound amask = (mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL : FETCH_ADDR_B_CYCLICAL; if (addrMode & amask) { if (useLocal || isRel) { u = (kdim) ? tileVecColsNum(physTile) : physTile->nrRows; ksprintf(&comps.bound, "%u", u); } else { // global bound if (kdim) { /* * For X and Y dimension the single task is to prevent * exceeding buffer bounds. Using leading dimension for * this is the easiest. */ sprintfLeadingDimension(&comps.bound, fctx); } else { const char *var = (fctx->fopts->mrole == MATRIX_A) ? fctx->gset->varNames.sizeM : fctx->gset->varNames.sizeN; kstrcpy(&comps.bound, var); } } } kstr = (kdim) ? &leadStr : &secStr; swap = kdim && vectLead; sprintfLinearOffset(kstr, &comps, swap); // fill components relating to bwidth memset(&comps, 0, sizeof(comps)); isRel = ((addrMode & FETCH_ADDR_K_RELATIVE) != 0); // base if (!isRel) { sprintfNormalizedBaseCoord(&comps.base, kvars->k, kdim, fctx); } // offset u = (kdim) ? line : vec; i = (kdim) ? 1 : offVlen; if (u || i) { sprintfOffsetVector(&comps.offset, u, i); } // bound if (addrMode & (FETCH_ADDR_K_CYCLICAL)) { if (useLocal || isRel) { if (useLocal) { u = (unsigned int)subdim->bwidth; } else { u = (kdim) ? physTile->nrRows : tileVecColsNum(physTile); } ksprintf(&comps.bound, "%u", u); } else { sprintfGboundK(&comps.bound, fctx); } } kstr = (kdim) ? 
&secStr : &leadStr; swap = !kdim && vectLead; sprintfLinearOffset(kstr, &comps, swap); if (fctx->oevp.ldNotMul) { kstrcpy(&leadDim.str, "1"); leadDim.scale = 0; } else if (useLocal) { leadDim.scale = 0; if (kdim) { u = (unsigned int)((mrole == MATRIX_A) ? subdim->y : subdim->x); } else { u = (unsigned int)subdim->bwidth; } ksprintf(&leadDim.str, "%u", u / vecLen); } else { fillRawLD(&leadDim, fctx); } // Build the full expression if (!isKstringEmpty(&leadStr) && vectLead) { Kstring tmp; sprintfFastScalarMad(&tmp, &secStr, &leadDim.str, leadDim.scale, NULL); if (isZero(&tmp)) { kstrcpy(expr, leadStr.buf); } else { ksprintf(expr, "%s + %s", leadStr.buf, tmp.buf); } } else { sprintfFastScalarMad(expr, &secStr, &leadDim.str, leadDim.scale, &leadStr); } } static void initStatelessAgent(AddressingAgent *agent) { memset(agent, 0, sizeof(AddressingAgent)); agent->match = matchStateless; agent->sprintfAddrOffset = sprintfOffsetStateless; } /************* Addressing agent using temporary coordinates ****************/ /* * Common approach: * * Save base offsets along both the physical dimensions so as to just * have only one add operation per each further offset evaluation. * Prediction of hight register consumption is used to decide how many * of offsets for each dimension can be saved. * 2 attempts are made. On the first one the maximal number of offsets is * tried to be allocated. This number is equal to the number of tile lines * or vectors in a line respectively. If this number will adittely cause * high register consumption, then only one offset is tried to be allocated. * If the situation repeats, then the offsets in this dimension are not saved * at all. * * Next point is that only those offsets are precomputed that are estimated * to take a lot of computing resources. * * In case of cyclical mode in the dimension K it is saved the global * size K in vectors. * * Offsets for A and B along the dimension K are be shared if the * caller advice to do that and number of them for A and B is the same. */ enum { TMP_COORD_AY, TMP_COORD_AK, TMP_A_VSIZEK, TMP_COORD_BX, TMP_COORD_BK, TMP_B_VSIZEK }; /* * The structure stores length of vectorized temporary variables storing * offsets for matrices A and B along rows/columns and the dimension K. */ typedef struct TmpCoordInfo { // vector length of the offset coordinate of A along rows unsigned int yaVlen; // vector length of the offset coordinate of A along the dimension K unsigned int kaVlen; // vector length of the offset coordinate of B along columns unsigned int xbVlen; // vector length of the offset coordinate of B along the dimension K unsigned int kbVlen; /* * shows if the respective coordinates are * declared as constants or not */ bool yaIsConst; bool kaIsConst; bool xbIsConst; bool kbIsConst; // force relative addressing along K for the matrix A bool forceRelA; // force relative addressign along K for the matrix B bool forceRelB; } MAY_ALIAS TmpCoordInfo; static unsigned int selectTmpCoordsNum( const FetchContext *fctx, unsigned int currNum, unsigned int reqNum, bool canShare) { if (predictHighRegConsumption(fctx, currNum + reqNum, false, canShare)) { if (predictHighRegConsumption(fctx, currNum + 1, false, canShare)) { reqNum = 0; } else { reqNum = 1; } } return reqNum; } /* * check if such number of temporary coordinates has any sence, * i. e. 
will lead eventually to mode efficient evaluation */ static bool tmpNumSanityCheck( unsigned int num, bool isConst, int kxy, bool isLoopPrep, const FetchContext *fctx) { unsigned int maxCoords[2]; int dim; bool ret = true; const Tile *physTile = &fctx->physTile; maxCoords[0] = tileVecColsNum(physTile); maxCoords[1] = physTile->nrRows; dim = bwidthPhysDimension(fctx); if (kxy) { dim = 1 - dim; } /* * Believe it is not reasonable if it is not constant value * and used few times. It is also right for constant values along X and Y * if they prepared within a loop rather than in advance * because the compiler is not able to recognize that those values are * not needed to be revaluated at each loop iteration. It is also not * reasonable if it is precomputed only one constant value whict doesn't * actually simplify evaluating linear coordinates in the same dimension: * believe it is so, if there is no vectorization at fetching or addressing * is cyclical, or this is a coordinate mapped to the second physical * dimension (because neverthless this assumes multiplication on leading * dimension) */ if (!isConst) { ret = (maxCoords[1 - dim] > 2); } else { FetchAddrMode cycMode; bool isCycled; if (!kxy) { cycMode = FETCH_ADDR_K_CYCLICAL; } else { ret = (isLoopPrep || (maxCoords[1 - dim] > 1)); cycMode = (fctx->fopts->mrole == MATRIX_A) ? FETCH_ADDR_A_CYCLICAL : FETCH_ADDR_B_CYCLICAL; } ret = ret && (!dim || (num == maxCoords[dim])); isCycled = ((fctx->addrMode & cycMode) != 0); if (!dim) { ret = ret && ((num > 1) || (physTile->vecLen > 1) || isCycled); } ret = ret && !(isCycled && (num < maxCoords[dim])); } return ret; } /* * Force relative addressing along K or X/Y dimension */ static __inline void forceRelativeAddressing(FetchContext *fctx, int kxy) { if (!kxy) { fctx->addrMode |= FETCH_ADDR_K_RELATIVE; fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL; } else { fctx->addrMode |= (FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE); fctx->addrMode &= ~(FETCH_ADDR_A_CYCLICAL | FETCH_ADDR_B_CYCLICAL); } } static bool matchTmpCoordBased(const FetchContext *fctx) { bool ret; if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) != FOPTLEV_TMP_COORD_PRECOMPUTING) { ret = false; } else { ret = !(estimateOffsetEvalCheap(fctx, 0) && estimateOffsetEvalCheap(fctx, 1)); } return ret; } static int prepareTmpCoords(FetchContext *fctx) { FetchAddrMode addrMode = fctx->addrMode; Kstring *vars = fctx->currAgent->vars; MatrixRole mrole = fctx->fopts->mrole; const Tile *physTile; const Kstring *kstr; TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv; int kdim; // for sure known summary number of allocated coordinates unsigned int coordsNum = 0; unsigned int n; unsigned int prepared = 0; unsigned int maxCoords[2]; bool canShare; bool isConst; bool normBoundK; Kstring *boundVars[2] = {&vars[TMP_A_VSIZEK], &vars[TMP_B_VSIZEK]}; int bvidx; // bound variable index in the previously declared array bool skip = false; /* * Believe that number of previously allocated coordinates * for the other tile is reliable if the caller advice to share * possible variables */ canShare = canBeKoffShared(fctx); if (canShare) { if (mrole == MATRIX_A) { coordsNum = info->xbVlen + info->kbVlen; } else { coordsNum = info->yaVlen + info->kaVlen; } } kdim = bwidthPhysDimension(fctx); physTile = &fctx->physTile; maxCoords[0] = tileVecColsNum(physTile); maxCoords[1] = physTile->nrRows; normBoundK = !kdim && !isLocalMemoryUsed(fctx->fopts) && (fctx->addrMode & FETCH_ADDR_K_CYCLICAL) && (physTile->vecLen > 1); n = 0; if (!estimateOffsetEvalCheap(fctx, 1)) { n = 
selectTmpCoordsNum(fctx, coordsNum, maxCoords[1 - kdim], canShare); isConst = (n == maxCoords[1 - kdim]) || (kdim == fctx->outerDim); if (!tmpNumSanityCheck(n, isConst, 1, fctx->isLoopPreparation, fctx)) { n = 0; } /* * Variable coordinates cannot be prepared before the loop starts. * If prepare before loop, the coordinates are considered as persistent * for more adequate prediction of register consumption. * Check also if if the coordinates for X or Y have been * already prepared at the loop preparation stage */ if (fctx->isLoopPreparation) { skip = !isConst || predictHighRegConsumption(fctx, coordsNum + n, true, canShare); } else { skip = isConst && (agentLoopPrepCount(fctx) > agentUsageCount(fctx)); } if (!skip) { if (mrole == MATRIX_A) { kstrcpy(&vars[TMP_COORD_AY], "ay"); kstr = &vars[TMP_COORD_AY]; info->yaIsConst = isConst; } else { kstrcpy(&vars[TMP_COORD_BX], "bx"); kstr = &vars[TMP_COORD_BX]; info->xbIsConst = isConst; } if (n) { /* * There are only needed offsets along rows of A or columns * of B. So, ensure that another offset components for A and B * don't contribute to the final expression. Setting for them * relative and not cycled addressing guarantees that the * respective expression will be equal to zero */ forceRelativeAddressing(fctx, 0); // fire immediate generating of coordinates declaration genInitVectCoord(fctx, kstr, n, 1, true, isConst); // restore original addressing mode fctx->addrMode = addrMode; prepared++; } } coordsNum += n; } if (!skip) { if (mrole == MATRIX_A) { info->yaVlen = n; } else { info->xbVlen = n; } } bvidx = (mrole == MATRIX_A) ? 0 : 1; if (normBoundK) { // global size K in vectors for the cyclical addressing if (canShare) { kstrcpy(boundVars[bvidx], boundVars[1 - bvidx]->buf); } else if (fctx->isLoopPreparation || (agentLoopPrepCount(fctx) <= agentUsageCount(fctx))) { const char *name; Kstring boundK; name = (mrole == MATRIX_A) ? "vKA" : "vKB"; kstrcpy(boundVars[bvidx], name); sprintfGboundK(&boundK, fctx); kgenBatchPrintf(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "const uint %s = %s;\n", boundVars[bvidx]->buf, boundK.buf); prepared++; } } else { // clear the bound because it may be already not actual emptyKstring(boundVars[bvidx]); } if (!fctx->isLoopPreparation) { n = 0; if (!estimateOffsetEvalCheap(fctx, 0)) { unsigned int maxn; // Ignore sharing if number of needed variables is not equal if (canShare) { maxn = (mrole == MATRIX_A) ? info->kbVlen : info->kaVlen; } else { maxn = maxCoords[kdim]; } n = selectTmpCoordsNum(fctx, coordsNum, maxn, canShare); if (n != maxn) { canShare = false; } if (canShare) { if (mrole == MATRIX_A) { kstrcpy(&vars[TMP_COORD_AK], vars[TMP_COORD_BK].buf); info->kaIsConst = info->kbIsConst; } else { kstrcpy(&vars[TMP_COORD_BK], vars[TMP_COORD_AK].buf); info->kbIsConst = info->kaIsConst; } } else { n = selectTmpCoordsNum(fctx, coordsNum, maxCoords[kdim], canShare); isConst = (n == maxCoords[kdim]) || (kdim != fctx->outerDim); if (!tmpNumSanityCheck(n, isConst, 0, false, fctx)) { n = 0; } if (mrole == MATRIX_A) { kstrcpy(&vars[TMP_COORD_AK], "ak"); kstr = &vars[TMP_COORD_AK]; info->kaIsConst = isConst; } else { kstrcpy(&vars[TMP_COORD_BK], "bk"); kstr = &vars[TMP_COORD_BK]; info->kbIsConst = isConst; } if (n) { const BlasGenSettings *gset = fctx->gset; BlasGenSettings newGset; // substitute normalized bound K if it has been precomputed if (normBoundK) { int idx = (mrole == MATRIX_A) ? 
TMP_A_VSIZEK : TMP_B_VSIZEK; memcpy(&newGset, gset, sizeof(BlasGenSettings)); newGset.varNames.sizeK = vars[idx].buf; fctx->gset = &newGset; fctx->oevp.gkInVect = true; } forceRelativeAddressing(fctx, 1); genInitVectCoord(fctx, kstr, 1, n, true, isConst); fctx->addrMode = addrMode; fctx->oevp.gkInVect = false; fctx->gset = gset; prepared++; } } } if (mrole == MATRIX_A) { info->kaVlen = n; } else { info->kbVlen = n; } } return (prepared != 0); } static int updateTmpCoords( struct FetchContext *fctx, unsigned int nextLine, unsigned int nextVec, int stmtPriority) { TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv; const Kstring *var = NULL; Kstring *agvars = fctx->currAgent->vars; const Tile *physTile = &fctx->physTile; int relIdx = 0; int ret = 0; if (!( (nextLine < physTile->nrRows) && (nextVec < tileVecColsNum(physTile)) )) { return 0; } /* * Update not constants coordinates. Only one coordinate for * each matrix can be non constant. */ if (fctx->fopts->mrole == MATRIX_A) { if ((info->yaVlen == 1) && !info->yaIsConst) { var = &agvars[TMP_COORD_AY]; } else if ((info->kaVlen == 1) && !info->kaIsConst) { var = &agvars[TMP_COORD_AK]; relIdx = 1; } } else { if ((info->xbVlen == 1) && !info->xbIsConst) { var = &agvars[TMP_COORD_BX]; } else if ((info->kbVlen == 1) && !info->kbIsConst) { var = &agvars[TMP_COORD_BK]; relIdx = 1; } } if (var != NULL) { Kstring offset; FetchAddrMode origMode = fctx->addrMode; /* * See the comment for coordinates initialization along X and Y * in prepareTmpCoords() to understand why the following is needed */ forceRelativeAddressing(fctx, relIdx); sprintfOffsetStateless(&offset, fctx, nextLine, nextVec); kgenBatchPrintf(fctx->batch, stmtPriority, "%s = %s;\n", var->buf, offset.buf); fctx->addrMode = origMode; ret = 1; } return ret; } static void sprintfTmpCoordBasedOffset( Kstring *expr, FetchContext *fctx, unsigned int line, unsigned int vec) { int kdim; const TmpCoordInfo *info = (TmpCoordInfo*)fctx->currAgent->priv; MatrixRole mrole = fctx->fopts->mrole; const Kstring *agvars = fctx->currAgent->vars; const Kstring *varK, *varXY; unsigned int xy, k; bool isConstK, isConstXY; bool savedK, savedXY; unsigned int maxK, maxXY; unsigned int idxK, idxXY; const BlasGenSettings *gset = fctx->gset; BlasGenSettings newGset; unsigned int phySizes[2]; Kstring tmpXY, tmpK; memcpy(&newGset, gset, sizeof(BlasGenSettings)); fctx->gset = &newGset; phySizes[0] = tileVecColsNum(&fctx->physTile); phySizes[1] = fctx->physTile.nrRows; kdim = bwidthPhysDimension(fctx); xy = (kdim) ? vec : line; k = (kdim) ? line : vec; /* * If the full set of precomputed coordinates for both the dimensions * has been saved, then form the target expression simply as sum of the * respective values in the dimensions. If the set is not full, e. g. only * the coordinate for the top left tile corner is saved, or no coordinates * is saved at all, then substitute kernel variables with respective * precomputed values (it there is some for the dimension), select new line * and vector accordingly, and invoke sprintf of the stateless agent. * At invoking the stateless agent cyclical addressing is disabled for * dimension having full set of precomputed coordinates because they * already take this into account. 
Eventually, since precomputed coordinates * for the second physical dimension already include multiplication on * leading dimension, disable this step for the stateless agent */ if (mrole == MATRIX_A) { isConstXY = info->yaIsConst; maxXY = info->yaVlen; varXY = &agvars[TMP_COORD_AY]; } else { isConstXY = info->xbIsConst; maxXY = info->xbVlen; varXY = &agvars[TMP_COORD_BX]; } idxXY = umin(xy, maxXY - 1); savedXY = maxXY && (!isConstXY || (xy < maxXY)); if (mrole == MATRIX_A) { isConstK = info->kaIsConst; maxK = info->kaVlen; varK = &agvars[TMP_COORD_AK]; } else { isConstK = info->kbIsConst; maxK = info->kbVlen; varK = &agvars[TMP_COORD_BK]; } idxK = umin(k, maxK - 1); savedK = maxK && (!isConstK || (k < maxK)); if (savedXY && savedK) { sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY); sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK); ksprintf(expr, "%s + %s", tmpXY.buf, tmpK.buf); } else { FetchAddrMode origMode = fctx->addrMode; unsigned int newLine = line; unsigned int newVec = vec; KernelVarNames *kvars = &newGset.varNames; const char **cname; if (maxXY) { cname = (mrole == MATRIX_A) ? &kvars->coordA : &kvars->coordB; sprintfVectorComponent(&tmpXY, varXY->buf, idxXY, maxXY); *cname = tmpXY.buf; if ( savedXY && (!kdim || (maxXY == phySizes[1 - kdim])) ) { if (mrole == MATRIX_A) { fctx->addrMode &= ~FETCH_ADDR_A_CYCLICAL; } else { fctx->addrMode &= ~FETCH_ADDR_B_CYCLICAL; } } if (kdim) { newVec = (savedXY) ? 0 : vec; fctx->oevp.coordInVect = true; } else { newLine = (savedXY) ? 0 : line; } } if (maxK) { sprintfVectorComponent(&tmpK, varK->buf, idxK, maxK); newGset.varNames.k = tmpK.buf; if ( savedK && (kdim || (maxK == phySizes[kdim])) ) { fctx->addrMode &= ~FETCH_ADDR_K_CYCLICAL; } if (kdim) { newLine = (savedK) ? 0 : line; } else { newVec = (savedK) ? 0 : vec; fctx->oevp.coordInVect = true; } } // Substitute the bound along K if it's needed if ((fctx->addrMode & FETCH_ADDR_K_CYCLICAL) && (maxK < phySizes[kdim])) { varK = (mrole == MATRIX_A) ? 
&agvars[TMP_A_VSIZEK] : &agvars[TMP_B_VSIZEK]; if (!isKstringEmpty(varK)) { newGset.varNames.sizeK = varK->buf; fctx->oevp.gkInVect = true; } } // Finally disable multiplying on leading dimension if ((maxXY && !kdim) || (maxK && kdim)) { fctx->oevp.ldNotMul = true; } // let the staless agent doesnt's stand idly by sprintfOffsetStateless(expr, fctx, newLine, newVec); // restore original settings fctx->oevp.coordInVect = false; fctx->oevp.gkInVect = false; fctx->oevp.ldNotMul = false; fctx->addrMode = origMode; } fctx->gset = gset; } static void initTmpCoordAgent(AddressingAgent *agent) { memset(agent, 0, sizeof(AddressingAgent)); agent->match = matchTmpCoordBased; agent->prepareVars = prepareTmpCoords; agent->updateVars = updateTmpCoords; agent->sprintfAddrOffset = sprintfTmpCoordBasedOffset; } /************* Addressing agent using persistent coordinates ***************/ enum { PERS_COORD_A, PERS_COORD_B, MAX_PERS_COORD_VECLEN = 8 }; typedef struct PersCoordInfo { // length of the vectorized coordinate for A unsigned int vlenA; // length of the vectorized coordinate for B unsigned int vlenB; } MAY_ALIAS PersCoordInfo; static unsigned int persCoordIdx( const Tile *physTile, unsigned int line, unsigned int vec, int kdim) { unsigned int n; if ((line == physTile->nrRows) || (vec == tileVecColsNum(physTile))) { n = tileVectorsNum(physTile); } else if (kdim) { n = line * tileVecColsNum(physTile) + vec; } else { n = vec * physTile->nrRows + line; } return n; } static bool matchPersCoordBased(const FetchContext *fctx) { bool ret; if ((fctx->optLevels & ~GENERIC_OPT_LEVELS) != FOPTLEV_PERS_COORD_PRECOMPUTING) { ret = false; } else { unsigned int maxK, depthK; int kdim; ret = !(estimateOffsetEvalCheap(fctx, 0) && estimateOffsetEvalCheap(fctx, 1)) && !isLocalMemoryUsed(fctx->fopts); ret = ret && !(fctx->addrMode & (FETCH_ADDR_K_RELATIVE | FETCH_ADDR_K_CYCLICAL)); /* * Don't use this agent if dimension K is passed in the inner loop * and maximum possible number of coordinates is not sufficient to * cover the entire tile size in this dimension. Using this agent * also makes no sense if even single step along K cannot be covered. */ depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN); // take any huge number to know maximum depth along K maxK = persVarDepthK(fctx, 16384); kdim = bwidthPhysDimension(fctx); ret = ret && (depthK && ((depthK == maxK) || (fctx->outerDim == kdim))); } return ret; } static int preparePersCoords(FetchContext *fctx) { unsigned int depthK; unsigned int n; Kstring *var; bool decl; int kdim; PersCoordInfo *info; MatrixRole mrole; if (agentLoopPrepCount(fctx) > agentUsageCount(fctx)) { return 0; } info = (PersCoordInfo*)fctx->currAgent->priv; mrole = fctx->fopts->mrole; if (mrole == MATRIX_A) { var = &fctx->currAgent->vars[PERS_COORD_A]; decl = isKstringEmpty(var); if (decl) { kstrcpy(var, "vca"); } } else { var = &fctx->currAgent->vars[PERS_COORD_B]; decl = isKstringEmpty(var); if (decl) { kstrcpy(var, "vcb"); } } kdim = bwidthPhysDimension(fctx); n = (kdim) ? tileVecColsNum(&fctx->physTile) : fctx->physTile.nrRows; depthK = persVarDepthK(fctx, MAX_PERS_COORD_VECLEN); if (mrole == MATRIX_A) { info->vlenA = n * depthK; } else { info->vlenB = n * depthK; } genInitVectCoord(fctx, var, n, depthK, decl, false); return 1; } static int updatePersCoords( FetchContext *fctx, unsigned int nextLine, unsigned int nextVec, int stmtPriority) { unsigned int step; int kdim; struct StatementBatch *batch = fctx->batch; const Kstring *var = (fctx->fopts->mrole == MATRIX_A) ? 
&fctx->currAgent->vars[PERS_COORD_A] : &fctx->currAgent->vars[PERS_COORD_B]; unsigned int nextCoord, maxCoords; PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv; const Tile *physTile; kdim = bwidthPhysDimension(fctx); maxCoords = (fctx->fopts->mrole == MATRIX_A) ? info->vlenA : info->vlenB; nextCoord = persCoordIdx(&fctx->physTile, nextLine, nextVec, kdim); if (nextCoord % maxCoords != 0) { return 0; } physTile = &fctx->physTile; step = (kdim) ? (maxCoords / tileVecColsNum(physTile)) : (maxCoords / physTile->nrRows); if (fctx->addrMode & FETCH_ADDR_BW_STRIDE) { step *= (unsigned int)fctx->gset->subdims[0].bwidth; } if (kdim) { struct RawLD ld; Kstring tmp1, tmp2; fillRawLD(&ld, fctx); ksprintf(&tmp1, "%u", step); sprintfFastScalarMad(&tmp2, &tmp1, &ld.str, ld.scale, NULL); kgenBatchPrintf(batch, stmtPriority, "%s += %s;\n", var->buf, tmp2.buf); } else { kgenBatchPrintf(batch, stmtPriority, "%s += %u;\n", var->buf, step); } return 1; } static void sprintfPersCoordBasedOffset( Kstring *kstr, FetchContext *fctx, unsigned int line, unsigned int vec) { const Kstring *var; unsigned int kdim; unsigned int idx, maxIdx; PersCoordInfo *info = (PersCoordInfo*)fctx->currAgent->priv; kdim = bwidthPhysDimension(fctx); maxIdx = (fctx->fopts->mrole == MATRIX_A) ? info->vlenA : info->vlenB; idx = persCoordIdx(&fctx->physTile, line, vec, kdim); var = (fctx->fopts->mrole == MATRIX_A) ? &fctx->currAgent->vars[PERS_COORD_A] : &fctx->currAgent->vars[PERS_COORD_B]; sprintfVectorComponent(kstr, var->buf, idx % maxIdx, maxIdx); } static void initPersCoordAgent(AddressingAgent *agent) { memset(agent, 0, sizeof(AddressingAgent)); agent->match = matchPersCoordBased; agent->prepareVars = preparePersCoords; agent->updateVars = updatePersCoords; agent->sprintfAddrOffset = sprintfPersCoordBasedOffset; } /***************************************************************************/ static void initPhysTile(FetchContext *fctx) { MatrixRole mrole = fctx->fopts->mrole; const BlasGenSettings *gset = fctx->gset; const Tile *dstTile; bool trans; Tile *physTile = &fctx->physTile; dstTile = getDstTile(fctx); trans = dstTile->trans; memset(physTile, 0, sizeof(Tile)); if ((mrole == MATRIX_A) && !(gset->flags & BGF_WHOLE_A)) { const SubproblemDim *dim = &gset->subdims[1]; physTile->nrRows = (unsigned int)(trans ? dim->bwidth : dim->y); physTile->nrCols = (unsigned int)(trans ? dim->y : dim->bwidth); } else { physTile->nrRows = trans ? dstTile->nrCols : dstTile->nrRows; physTile->nrCols = trans ? dstTile->nrRows : dstTile->nrCols; } physTile->vecLen = getVecLen(gset, CLBLAS_GEMM, mrole); physTile->baseName = (mrole == MATRIX_A) ? gset->varNames.A : gset->varNames.B; } static void sprintfPhysTileElement( Kstring *elem, FetchContext *fctx, unsigned int line, unsigned int vec) { Kstring ptr; Kstring off; const char *varName; const BlasGenSettings *gset = fctx->gset; varName = (fctx->fopts->mrole == MATRIX_A) ? gset->varNames.A : gset->varNames.B; if (fctx->gset->flags & BGF_UPTRS) { const char *ptrName; getVectorTypeName(gset->kextra->dtype, fctx->physTile.vecLen, NULL, &ptrName); ksprintf(&ptr, "%s.%s", varName, ptrName); } else { kstrcpy(&ptr, varName); } fctx->currAgent->sprintfAddrOffset(&off, fctx, line, vec); ksprintf(elem, "%s[%s]", ptr.buf, off.buf); } static void genHandLoad( FetchContext *fctx, const Tile *dstTile, unsigned int lineOffset, unsigned int line, unsigned int vec, unsigned int vecLen, int stmtPriority) { Kstring src, dst; unsigned int row, col; row = (dstTile->trans) ? 
(vec * vecLen) : line; col = (dstTile->trans) ? line : (vec * vecLen); sprintfPhysTileElement(&src, fctx, line + lineOffset, vec); sprintfTileElement(&dst, dstTile, row, col, vecLen); kgenBatchPrintf(fctx->batch, stmtPriority, "%s = %s;\n", dst.buf, src.buf); } /* * Invoke update variable methods if it is presented. * Return priority that must be used for subsequent statements. * Via the parameter 'priority' the function accept the last used * priority level */ static int checkGenUpdateVars( FetchContext *fctx, unsigned int nextLine, unsigned int nextVec, int priority) { AddressingAgent *agent = fctx->currAgent; const Tile *physTile = &fctx->physTile; int nextPrio; bool endTile; endTile = (nextLine == physTile->nrRows) || (nextVec == physTile->nrCols); if (endTile) { kgenAddStmtToBatch(fctx->batch, priority, "\n"); } nextPrio = canBeFetchesMerged(fctx) ? (priority + 1) : priority; if (agent->updateVars && agent->updateVars(fctx, nextLine, nextVec, nextPrio)) { if (canBeFetchesMerged(fctx)) { priority += 2; } } else if (!endTile && (fctx->fopts->linesNum == 1) && tileVecColsNum(physTile) > 1) { kgenAddStmtToBatch(fctx->batch, priority, "\n"); } return priority; } static void doGenFetch(FetchContext *fctx) { const FetchOpts *fetchOpts = fctx->fopts; unsigned int lineOffset = fetchOpts->lineOffset; unsigned int linesNumber = fetchOpts->linesNum; const Tile *physTile, *dstTile; unsigned int i, j; // length of vectors the tile will be fetched with unsigned int vecLen; int priority = PREPARE_VARS_STMT_PRIORITY + 1; physTile = &fctx->physTile; dstTile = getDstTile(fctx); vecLen = umin(dstTile->vecLen, physTile->vecLen); if (fctx->outerDim) { for (i = 0; i < linesNumber; i++) { for (j = 0; j < physTile->nrCols / vecLen; j++) { /* * TODO: add ability to use load with vload() depending * on some option set */ genHandLoad(fctx, dstTile, lineOffset, i, j, vecLen, priority); } priority = checkGenUpdateVars(fctx, lineOffset + i + 1, 0, priority); } } else { for (j = 0; j < tileVecColsNum(physTile); j++) { for (i = 0; i < linesNumber; i++) { genHandLoad(fctx, dstTile, lineOffset, i, j, vecLen, priority); } priority = checkGenUpdateVars(fctx, lineOffset, j + 1, priority); } } } struct FetchContext *createFetchContext(void) { FetchContext *fctx; int i = 0; fctx = calloc(1, sizeof(FetchContext)); if (fctx != NULL) { fctx->addrMode = FETCH_ADDR_NORMAL; fctx->optLevels = FOPTLEV_TMP_COORD_PRECOMPUTING; } // init addressing agents while (initAgentsTable[i] != NULL) { initAgentsTable[i](&fctx->agents[i]); i++; } fctx->oevp.leadVecLen = 1; fctx->outerDim = 1; return fctx; } void destroyFetchContext(struct FetchContext *fctx) { free(fctx); } FetchOptLevel getFetchOptLevels(struct FetchContext *fctx) { return fctx->optLevels; } void enableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels) { fctx->optLevels |= levels; } void disableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels) { fctx->optLevels &= ~levels; } FetchAddrMode getFetchAddrMode(const struct FetchContext *fctx) { return fctx->addrMode; } void setFetchAddrMode(struct FetchContext *fctx, FetchAddrMode mode) { fctx->addrMode = mode; } FetchAddrMode setDefaultFetchAddrMode( struct FetchContext *fctx, const BlasGenSettings *gset, FetchAddrMode mask, int tailStatus, bool processTailK) { FetchAddrMode addrMode = fctx->addrMode; KernelExtraFlags kflags = gset->kextra->flags; if ((kflags & KEXTRA_TAILS_M_LOWER) && !(tailStatus & TAIL_A_RAISED)) { addrMode &= ~FETCH_ADDR_A_RELATIVE; addrMode |= FETCH_ADDR_A_CYCLICAL; } else { addrMode 
&= ~FETCH_ADDR_A_CYCLICAL; addrMode |= FETCH_ADDR_A_RELATIVE; } if ((kflags & KEXTRA_TAILS_N_LOWER) && !(tailStatus & TAIL_B_RAISED)) { addrMode &= ~FETCH_ADDR_B_RELATIVE; addrMode |= FETCH_ADDR_B_CYCLICAL; } else { addrMode &= ~FETCH_ADDR_B_CYCLICAL; addrMode |= FETCH_ADDR_B_RELATIVE; } if (kflags & KEXTRA_TAILS_K_LOWER) { addrMode &= ~FETCH_ADDR_K_RELATIVE; } else { addrMode |= FETCH_ADDR_K_RELATIVE; } if (processTailK) { addrMode |= FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD; } else { addrMode &= ~(FETCH_ADDR_K_CYCLICAL | FETCH_ADDR_TAILK_PADD); } addrMode &= ~mask; fctx->addrMode = addrMode; return addrMode; } int prepareFetchLoop( struct KgenContext *genCtx, struct FetchContext *fetchCtx, const BlasGenSettings *gset, CLMemType memA, CLMemType memB) { AddressingAgent *agent, *saved; FetchOpts fopts; int i; int ret = 0; int cnt = 0; memset(&fopts, 0, sizeof(FetchOpts)); fopts.memA = memA; fopts.memB = memB; fetchCtx->fopts = &fopts; fetchCtx->gset = gset; fetchCtx->batch = createStmtBatch(); if (fetchCtx->batch == NULL) { return -ENOMEM; } saved = fetchCtx->prevAgent; fetchCtx->isLoopPreparation = true; for (i = 0; i < 2; i++) { fopts.mrole = (i) ? MATRIX_A : MATRIX_B; initPhysTile(fetchCtx); selectAddrAgent(fetchCtx); agent = fetchCtx->currAgent; if (agent->prepareVars) { if (agent->prepareVars(fetchCtx)) { cnt++; incAgentLoopPrepCount(fetchCtx); /* * Substitute previous agent so as the it could * know that some variables can be really shared * if it is selected again */ fetchCtx->prevAgent = agent; } } } fetchCtx->isLoopPreparation = false; fetchCtx->prevAgent = saved; if (cnt) { flushStmtBatch(genCtx, fetchCtx->batch); ret = kgenAddBlankLine(genCtx); if (ret) { ret = -EOVERFLOW; } } destroyStmtBatch(fetchCtx->batch); fetchCtx->batch = NULL; return ret; } void revalidateFetchContext(struct FetchContext *fctx, MatrixRole mrole) { if (fctx->currAgent != NULL) { int i = (mrole == MATRIX_A) ? 0 : 1; fctx->valid[i] = true; } } static void genFetchCommon(struct FetchContext *fctx) { if (fctx->fopts->mulOpts) { fctx->addrMode = fetchAddrModeFromMulOpts(fctx->fopts->mulOpts); } // prepare needed variables if (!isFetchContextValid(fctx)) { fctx->prevAgent = fctx->currAgent; selectAddrAgent(fctx); if (fctx->currAgent->prepareVars && fctx->currAgent->prepareVars(fctx)) { kgenAddStmtToBatch(fctx->batch, PREPARE_VARS_STMT_PRIORITY, "\n"); } } // fire fetch generation revalidateFetchContext(fctx, fctx->fopts->mrole); doGenFetch(fctx); incAgentUsageCount(fctx); invalidateFetchContext(fctx); } int genFetchInputTile( struct KgenContext *ctx, struct FetchContext *fctx, const BlasGenSettings *gset, const FetchOpts *fetchOpts) { int ret; fctx->batch = createStmtBatch(); if (fctx->batch == NULL) { return -ENOMEM; } fctx->fopts = fetchOpts; fctx->gset = gset; initPhysTile(fctx); genFetchCommon(fctx); ret = flushStmtBatch(ctx, fctx->batch); destroyStmtBatch(fctx->batch); fctx->batch = NULL; return (ret) ? -EOVERFLOW : 0; } void genFetchInputTileBatch( struct StatementBatch *batch, struct FetchContext *fctx, const struct BlasGenSettings *gset, const FetchOpts *fetchOpts) { fctx->fopts = fetchOpts; fctx->gset = gset; initPhysTile(fctx); fctx->batch = batch; genFetchCommon(fctx); fctx->batch = NULL; } clblas-2.10/src/library/blas/gens/fetch.h000066400000000000000000000314751264277366700203040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef FETCH_H_ #define FETCH_H_ /** * @internal * @defgroup FETCH_GEN Generating fetches from memory * @ingroup BLAS_GENERATORS */ /*@{*/ /** * @internal * @brief Context for the fetch generator */ struct FetchContext; struct BlasGenSettings; //enum TailStatus; // FIXME: Deprecated. Throw later struct TileMulOpts; /** * @internal * @brief Optimization levels for the fetch generator with witch the caller * can control some aspects of the code generation. * * !!NOTE: At expanding this list, the levels must be placed in ascending * order of their importance. * * @ingroup BLAS_MAJOR_SUBGENS */ typedef enum FetchOptLevel { /** Expand the fetch loop in the way providing a prefetch effect */ FOPTLEV_PREFETCH = 0x01, /** * Can share temporary coordinates for A and B. Usable in case when * A and fetches are fired sequentially and hence in some cases can * share the same temporary coordinates. Must be set only if fetch * has been already fired for one of the tiles. Otherwise result is * undefined. */ FOPTLEV_CAN_SHARE_TMP_AB = 0x02, /** * Reorder generated statements so as fethes would be groupped * all together */ FOPTLEV_MERGE_FETCHES = 0x04, /** Enable using of temporary precomputed coordinates */ FOPTLEV_TMP_COORD_PRECOMPUTING = 0x08, /** Enable using of persistent precomputed coordinates */ FOPTLEV_PERS_COORD_PRECOMPUTING = 0x10 } FetchOptLevel; /** * @internal * @brief Addressing modes for the fetch generator */ typedef enum FetchAddrMode { /** * Normal mode. Fetching is performed only with full vectors. * Physical coordinates in memory are absolute for the matrices and * evaluated only based on the logical coordinates along rows of the * matrix \b A, columns of the matrix \b B and coordinate along K */ FETCH_ADDR_NORMAL = 0, /** * Pointer for the matrix A is set at start of the tile panel. * All resulting coordinates will be relative against this base. * KernelVarNames::CoordA the generator settings structure is not used */ FETCH_ADDR_A_RELATIVE = 0x01, /** * Pointer for the matrix B is set at start of the tile panel. * All resulting coordinates will be relative against this base. * KernelVarNames::CoordB the generator settings structure is not used */ FETCH_ADDR_B_RELATIVE = 0x02, /** * Pointers for A and B match the current coordinate along dimension K and * thus set at the beginning of the tile. All resulting coordinates will be * relative against the current value of the pointers. * KernelVarNames::CoordA, KernelVarNames::coordB and KernelVarNames * accessible via the generator settings structure are not used */ FETCH_ADDR_K_RELATIVE = 0x04, /** * Cyclical addressing along rows of \b A. That means substracting * number of rows from the coordinate in case of exceeding it. 
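 * For example, if 7 rows are available and the computed row coordinate is 9,
 * the fetch is taken from row 2 (9 - 7); a tile overlapping the matrix edge
 * thus reads its tail from the beginning rather than addressing memory
 * outside the matrix.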
*/ FETCH_ADDR_A_CYCLICAL = 0x08, /** Cyclical addressing along columns of B */ FETCH_ADDR_B_CYCLICAL = 0x10, /** Cyclical addressing along K dimension */ FETCH_ADDR_K_CYCLICAL = 0x20, /** * Perform padding of the trailing part along dimension K. * That allows perform a vectorized fetch of tail including a piece being * outside the size along K. It affects only if K expands along the leading * dimension */ FETCH_ADDR_TAILK_PADD = 0x40, /* * Expand loop with stride equal to witdth of the top level block */ FETCH_ADDR_BW_STRIDE = 0x80 } FetchAddrMode; // FIXME: Deprecated and should be thrown away later union FetchTmpVarName { const char *idx; const char *uptr; }; /** * @internal * @brief Specific settings for the fetching generator * @ingroup BLAS_MAJOR_SUBGENS */ typedef struct FetchOpts { MatrixRole mrole; CLMemType memA; /**< type of memory matrix A is located on */ CLMemType memB; /**< type of memory matrix B is located on */ unsigned int lineOffset; unsigned int linesNum; const char *regName; // TODO: the field is deprecated. Remove it /* * FIXME: one more klugde for backward compatibility; get addressing * mode from the options of tilemul */ const struct TileMulOpts *mulOpts; // TODO: All the following fields are deprecated. Remove it union FetchTmpVarName tmpYvar; union FetchTmpVarName tmpXvar; const char *alvM; /**< vecLen-aligned M in vectors */ const char *alvN; /**< vecLen-aligned N in vectors */ const char *alvKA; /**< vecLen-aligned K in vectors of A */ const char *avlKB; /**< vecLen-aligned K in vectors of B */ const char *ax; /**< matrix A x coordinate, in vectors */ const char *ay; /**< matrix A y coordinate */ const char *bx; /**< matrix B x coordinate, in vectors */ const char *by; /**< matrix B y coordinate */ const char *ldav; /**< matrix A leading dimension, in vectors */ const char *ldbv; /**< matrix B leading dimension, in vectors */ const char *skewArow; /**< matrix A rows skew */ const char *skewAcol; /**< matrix A columns skew, in vectors */ const char *skewBrow; /**< matrix A rows skew */ const char *skewBcol; /**< matrix A columns skew, in vectors */ } FetchOpts; /** * @internal * @brief Create context for the fetch generator * * After creation there are enabled optimization levels relating * to precomputing with storing to temporary coordinates. 
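 * (Concretely, FOPTLEV_TMP_COORD_PRECOMPUTING is the level enabled by default.)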
* Addressing mode is set to ::FETCH_ADDR_NORMAL * * @return pointer to a new context object on success, NULL otherwise */ struct FetchContext *createFetchContext(void); /** * @internal * @brief Destroy fetch generator context * * @param[out] fctx Fetch generator context to destroy */ void destroyFetchContext(struct FetchContext *fctx); /** * @internal * @brief Get current fetch optimization levels * * @param[in] fctx Fetch context */ FetchOptLevel getFetchOptLevels(struct FetchContext *fctx); /** * @internal * @brief Enable needed code optimization levels the fetch generator * * @param[out] ctx Generator context * @param[in] opts Fetch Options */ void enableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels); /** * @internal * @brief Disable unneeded code optimization levels for the fetch generator * * @param[out] ctx Generator context * @param[in] opts Fetch Options */ void disableFetchOptLevels(struct FetchContext *fctx, FetchOptLevel levels); /** * @internal * @brief Get current addressing mode used by the fetch generator * * @param[in] fctx Fetch context */ FetchAddrMode getFetchAddrMode(const struct FetchContext *fctx); /** * @internal * @brief Set addressing mode for the fetch generator * * @param[out] fctx Fetch context * @param[in] mode Addressing mode to set */ void setFetchAddrMode(struct FetchContext *fctx, FetchAddrMode mode); /** * @internal * @brief Set default fetch addressing mode based on the problem specifics * * @param[out] fctx Fetch context * @param[in] gset Generator settings * @param[in] mask Addressing mode mask * @param[in] tailStatus Tails handling status * @param[in] processTailK Flag showing if the tail part along the * dimension K is picked up or not. * * Primarily, the function checks if there are tails along rows of A, * columns of B, dimension K and if some tails are raised or not. * Based on this info and also taking into account fetch vector length, * it set appropriate addressing mode to don't exceed matrix bounds during * the fetch operations. If there are not "small" tails for rows of A and * columns of B is selects relative addressing for them. If there are not * "small" tails along K, it selects relative addressing for this dimension * as well. * * The addressing mode mask passed via the \b mask parameter is used to * not set addressing modes not suitable for callers. Resulting addressing * mode which is set is presented as bitwise AND of a default value selected * by the function and bitwise negated value of the mask * * \b tailStatus is a bit mask of values consisting the #TailStatus enumeration. * * @return Addressing mode the function set during the last call. */ FetchAddrMode setDefaultFetchAddrMode( struct FetchContext *fctx, const struct BlasGenSettings *gset, FetchAddrMode mask, int tailStatus, bool processTailK); /** * @internal * @brief Prepare the fetch generator to generate efficient fetches * within the K loop * * @param[out] genCtx Generator context * @param[out] fetchCtx Fetch context * @param[in] gset Generator settings * @param[in] memA Type of memory the matrix A is stored in * @param[in] memB Type of memory the matrix B is stored in * * Basically, the function lets to declare all needed for work of the fetch * generator. If a user lots upon efficient fetching within the tilemul loop, * he should call the function before generating that loop. * If it is not invoked, the fetch generator produces a code in some default * way which may be far from efficient. 
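 *
 * Purely as an illustration of the intended calling sequence (a hypothetical
 * sketch rather than code taken from this library; 'genCtx', 'gset', 'memA',
 * 'memB' and 'tailStatus' are assumed to have been prepared by the caller):
 *
 * @code
 * struct FetchContext *fctx = createFetchContext();
 * FetchOpts fopts;
 *
 * memset(&fopts, 0, sizeof(FetchOpts));
 * fopts.memA = memA;
 * fopts.memB = memB;
 *
 * // pick a default addressing mode and prepare shared variables up front
 * setDefaultFetchAddrMode(fctx, gset, 0, tailStatus, false);
 * prepareFetchLoop(genCtx, fctx, gset, memA, memB);
 *
 * // ... generate the opening of the K loop ...
 *
 * fopts.mrole = MATRIX_A;
 * genFetchInputTile(genCtx, fctx, gset, &fopts);   // fetch a tile of A
 * fopts.mrole = MATRIX_B;
 * genFetchInputTile(genCtx, fctx, gset, &fopts);   // fetch a tile of B
 *
 * // ... generate the tile multiplication and close the loop ...
 *
 * destroyFetchContext(fctx);
 * @endcode
 *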
The stuff prepared with the function is * valid only for one fetch call. If the user needs to use the same once again, * it may use revalidateFetchContext(). */ int prepareFetchLoop( struct KgenContext *genCtx, struct FetchContext *fetchCtx, const struct BlasGenSettings *gset, CLMemType memA, CLMemType memB); /** * @internal * @brief Revalidate fetch context * * @param[out] fctx Fetch context * @param[in] mrole Matrix to revalidate the context for * * Enable the fetch generator to use the stuff produces with the last call * of prepareFetch() once again. */ void revalidateFetchContext(struct FetchContext *fctx, MatrixRole mrole); /** * @internal * @brief Tile fetching generator * * @param[out] genCtx Generator context * @param[in] fetchCtx FetchContext * @param[in] gset Generator settings * @param[in] fetchOpts Fetch-specific generator options * * This function generates code which fetches tile a or b from global or local * memory into private memory.\n * Generated code fetches tiles by vectors using coordinate values in vectors * from @ref FetchOpts. * Complex types and conjugated tiles are supported. Global cycling is supported * for global memory fetching - this mean that if tile overlaps matrix * the tail of tile will be fetched from the beginning instead of accessing * memory outside the matrix.\n * Second level of subdimensions is used for tile sizes.\n * Tile can be fetched from global memory or from local memory. * If tile is fetched from local memory then leading dimensions for local * memory area are taken from first level subdimensions.\n * Post-fetch callback generator function can be called after fetching tile * for zeroing tails or setting diagonal elements to one. This function is * provided by caller in @ref TileMulOpts.postFetch.\n * After the function completes its work it invalidates the fetch context, and * all the stuff that has been prepared before, will not be used in the next * fetch transaction. * * @return 0 on success * @return -EOVERFLOW on source buffer overflowing */ int genFetchInputTile( struct KgenContext *genCtx, struct FetchContext *fetchCtx, const struct BlasGenSettings *gset, const FetchOpts *fetchOpts); /** * @internal * @brief Fetch input tile * * @param[out] batch Statement batch * @param[in] gset Generator settings * @param[in] fetchOpts Fetch Options * * The function has the same effect and semantics as the previous one, * but put the code to the intermediate statement batch rather than a target * generator context. */ void genFetchInputTileBatch( struct StatementBatch *batch, struct FetchContext *fctx, const struct BlasGenSettings *gset, const FetchOpts *fetchOpts); /*@}*/ #endif /* FETCH_H_ */ clblas-2.10/src/library/blas/gens/gbmv.cpp000066400000000000000000000305431264277366700204740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * gbmv generator */ //#define DEBUG_GBMV #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs); static SolverFlags solverFlags(void) { #ifdef DEBUG_GBMV printf("solverFlags callen......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initGbmvRegisterPattern(MemoryPattern *mempat); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps gbmvOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, getDefaultDecomposition, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_GBMV printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->pigFuncID == CLBLAS_TBMV ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTBMV_ONLY"); if( kargs->diag == clblasUnit ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUNIT_DIAG"); } } if( ((kargs->pigFuncID == CLBLAS_GBMV) || (kargs->pigFuncID == CLBLAS_TBMV)) && (kargs->transA == clblasConjTrans) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ"); } if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) ) { bool isUpper = ( kargs->uplo == clblasUpper )? true: false; isUpper = ( kargs->order == clblasColumnMajor )? 
!isUpper : isUpper; if( isUpper ) addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_UPPER"); else addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_LOWER"); if(kargs->pigFuncID == CLBLAS_HBMV) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHBMV_ONLY"); if( kargs->order == clblasColumnMajor ) // Since routine calls Row-major, the whole matrix has to be conjugated while loading { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ"); } } } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initGbmvRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_GBMV printf("initGBMVREgPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based gbmv"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &gbmvOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block size_t fM, fN; const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extra->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extra->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); fM = kargs->M; fN = kargs->N; if ( order == clblasColumnMajor ) { order = clblasRowMajor; fM = kargs->N; fN = kargs->M; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } } if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) ) // Only NT kernel is used { trans = clblasNoTrans; } size_t blocks; size_t H = subdims->x; size_t TARGET_ROWS = BLOCKSIZE / H; if( trans == clblasNoTrans ) { blocks = ((fM - 1)/ TARGET_ROWS) + 1; } else { blocks = ((fN - 1)/ H) + 1; } threads[0] = blocks * BLOCKSIZE; threads[1] = 1; #ifdef DEBUG_GBMV printf("calcNrThreads called from gbmv.cpp\n"); printf("BLOCKSIZE : %d, subdims->x : %d\n", BLOCKSIZE, H); printf("blocks : %d\n", blocks); printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { size_t BLOCKSIZE = pgran->wgSize[0]; size_t H = subdims->x; char tempTemplate[64*1024]; char def_target_rows[10], def_h[10]; SolutionStep *step = container_of( pgran , pgran, SolutionStep); // NOTE: using container_of() to get pigFuncID CLBlasKargs* kargs = (CLBlasKargs*) &(step->args); if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; //clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? 
clblasConjTrans: clblasNoTrans); if ( order == clblasColumnMajor ) { order = clblasRowMajor; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } } if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) ) // Only NT kernel is used { trans = clblasNoTrans; } if ((BLOCKSIZE % H) != 0) { printf("WARNING: GBMV: generator: Invalid Block Size\n"); return 0; } size_t TARGET_ROWS = BLOCKSIZE / H; if ( trans == clblasNoTrans) { strcpy(tempTemplate, (char*)gbmv_RNT_kernel); } else // Transpose cases... { strcpy(tempTemplate, (char*)gbmv_RT_kernel);; } unsigned int vecLenA = extraFlags->vecLenA; bool doVLOAD = false; // Always scalar load for banded matrices kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); sprintf( def_target_rows, "%d", (int)TARGET_ROWS ); sprintf( def_h, "%d", (int)H ); #ifdef DEBUG_GBMV printf("GBMV GENERATOR called....\n"); if((( extraFlags->flags & KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A ))) { printf("A is trans or CONJ-TRANS\n"); } else { printf("A is noTrans...\n"); } printf("TARGET ROWS = %s\n", def_target_rows); printf("H = %s\n", def_h); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif kobj.put("%DEF_H", (const char *)def_h); kobj.put("%DEF_TARGET_ROWS", (const char *)def_target_rows); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); } /* __kernel void %PREFIXgbmv_RNT_kernel( __global const %TYPE * _A, __global %TYPE * _y_vector, __global %TYPE const* restrict _x_vector, uint M, uint N, uint KL, uint KU, uint lda, int incx, int incy, uint offa, uint offx, uint offy ifndef TBMV_ONLY ,%TYPE alpha, %TYPE beta endif */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; size_t fM, fN, fKL, fKU; cl_int inc; if( blasArgs->order == clblasColumnMajor ) // M, N, KL, KU gets swapped { fM = blasArgs->N; fN = blasArgs->M; fKL = blasArgs->KU; fKU = blasArgs->KL; } else { fM = blasArgs->M; fN = blasArgs->N; fKL = blasArgs->KL; fKU = blasArgs->KU; } INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->C); //y - y vector INIT_KARG(&args[2], blasArgs->B); //x - actual x vector argument initSizeKarg(&args[3], fM); initSizeKarg(&args[4], fN); initSizeKarg(&args[5], fKL); initSizeKarg(&args[6], fKU); initSizeKarg(&args[7], blasArgs->lda.matrix); inc = blasArgs->ldb.vector; INIT_KARG(&args[8], inc); inc = blasArgs->ldc.vector; INIT_KARG(&args[9], inc); initSizeKarg(&args[10], blasArgs->offa); initSizeKarg(&args[11], blasArgs->offBX); initSizeKarg(&args[12], blasArgs->offCY); // For GBMV, SBMV, HBMV both alpha and beta has to be passed. 
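/*
 * TBMV kernels are built with -DTBMV_ONLY and their signature omits the
 * alpha/beta parameters (see the kernel template comment above), so the two
 * scalar arguments are appended only on the GBMV, SBMV and HBMV paths.
 */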
if( (blasArgs->pigFuncID == CLBLAS_GBMV) || (blasArgs->pigFuncID == CLBLAS_SBMV) || (blasArgs->pigFuncID == CLBLAS_HBMV) ) { assignScalarKarg(&args[13], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[14], &(blasArgs->beta), blasArgs->dtype); } #ifdef DEBUG_GBMV printf("KL %d\tKU %d\n", fKL, fKU); #endif return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { kernelArgs = kernelArgs; // To remove warnings cl_ulong maxSize = ( (dim[0].x+1) * dim[0].y ) * sizeof(dtype); return ( maxSize <= ldsSize ); } static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs) { SolutionStep *step = container_of( pgran , pgran, SolutionStep); size_t maxWorkGroupSize; cl_device_id devID = step->device.id; size_t wgX, wgY; pArgs = pArgs; clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); if (maxWorkGroupSize >= 256) { wgX = 32; wgY = 8; } else if (maxWorkGroupSize >= 128) { wgX = 32; wgY = 4; } else { // // PENDING: What if maxWorkGroupSize < 64 ???? // wgX = 32; wgY = 2; } pgran->wgDim = 1; //1D blocking pgran->wgSize[0] = (unsigned int)(wgX * wgY); pgran->wgSize[1] = 1; if(subdimsNum > 0) { subdims[0].y = wgY ; subdims[0].x = wgX ; subdims[0].itemX = subdims[0].x; subdims[0].itemY = subdims[0].y; subdims[0].bwidth = 1; } if(subdimsNum > 1) { subdims[1].itemY = 1; subdims[1].itemX = 1; subdims[1].y = subdims[1].itemY; subdims[1].x = subdims[1].itemX; subdims[1].bwidth = 1; } return 0; } clblas-2.10/src/library/blas/gens/gemm.c000066400000000000000000001162151264277366700201270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Cached global buffers based gemm generator */ #include #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include "blas_subgroup.h" #include "gen_helper.h" typedef struct { size_t staggered; } MAY_ALIAS extraData_t; static CLBLASMpatExtra mpatExtra; static ssize_t blockGen( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t subgGen( char *pBuf, size_t buflen, const struct SubproblemDim *pSubDims, const struct PGranularity *pPGran, void *pExtra ); static void assignBlockKargs( KernelArg *args, const void *params, const void *extra); static bool blockCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static int blockGetPerf( unsigned int kflags, const void *args); static void assignSubgKargs( KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static DecompositionAxis innerDecompositionAxis(const void *args); static int gemmSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs); static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static void subgCalcGlobalThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra ); static int subgGetPerf( unsigned int kflags, const void *args); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static SolverOps blockSOps = { blockGen, assignBlockKargs, NULL, blockGetPerf, innerDecompositionAxis, NULL, NULL, solverFlags, NULL,// fixup kargs NULL, //blockGetDefaultDecomp, blockCheckCalcDecomp, NULL, NULL }; static SolverOps subgSOps = { subgGen, assignSubgKargs, NULL, subgGetPerf, innerDecompositionAxis, subgCalcGlobalThreads, NULL, solverFlags, fixupArgs,// fixup kargs gemmSubgGetDefaultDecomp, subgCheckCalcDecomp, NULL, NULL }; //***************************************************************************** //----------------------------------------------------------------------------- static void genSetupItemPtr( struct KgenContext *ctx, const BlasGenSettings *gset, MatrixRole mrole) { char tmp[1024]; unsigned int vecLen; char ldv[64]; int shift; char ptrLit; char shiftMul[128]; size_t tileWidth; int widx; KernelExtraFlags kflags = gset->kextra->flags; /* * The matrix was made B inner if every thread should accesses their * elements with a large stride but accesses elements of the matrix A * sequentially to provide more coalesced memory accesses. * Otherwise, the matrix A was made inner. */ widx = (!isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A) && isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) ? 
1 : 0; vecLen = getVecLen(gset, CLBLAS_GEMM, mrole); shift = findHighestSetBit(vecLen); if (mrole == MATRIX_A) { tileWidth = gset->subdims[1].y; ptrLit = 'A'; if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) { sprintf(ldv, "(lda >> %d)", shift); } else { strcpy(ldv, "lda"); } } else { tileWidth = gset->subdims[1].x; ptrLit = 'B'; if ((shift > 0) && !(gset->flags & BGF_LD_IN_VECTORS)) { sprintf(ldv, "(ldb >> %d)", shift); } else { strcpy(ldv, "ldb"); } widx = 1 - widx; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, mrole)) { if (tileWidth / vecLen > 1) { sprintf(shiftMul, " * %lu", tileWidth / vecLen); } else { shiftMul[0] = '\0'; } // Alternative calculate global thead id to eliminate Channel Conflicts. if (mrole == MATRIX_B) { int bankSize = 2048; int dataSize = 0; int grShift; DataType dtype = gset->kextra->dtype; switch (dtype) { case TYPE_FLOAT: dataSize = 4; break; case TYPE_COMPLEX_DOUBLE: dataSize = 16; break; default: dataSize = 8; break; } grShift = bankSize/ dataSize; sprintf(tmp, "get_group_id_%d = (get_group_id(0) + get_group_id(1))" "%% get_num_groups(%d);\n", widx, widx); kgenAddStmt(ctx, tmp); sprintf(tmp, "get_global_id_%d = get_group_id_%d * get_local_size(%d) " "+ get_local_id(%d);\n",widx, widx, widx, widx); kgenAddStmt(ctx, tmp); sprintf(tmp, "kif = (N %% %d != 0);\n" "get_global_id_%d = (kif*(uint)get_global_id(%d)) + " "((1-kif)*get_global_id_%d);\n",grShift, widx, widx, widx); kgenAddStmt(ctx, tmp); sprintf(tmp, "%c += get_global_id_%d%s;", ptrLit, widx, shiftMul); } else { sprintf(tmp, "%c += (uint)get_global_id(%d)%s;\n", ptrLit, widx, shiftMul); } } else { sprintf(tmp, "%c += %luu * (uint)get_global_id(%d) * %s;\n", ptrLit, tileWidth, widx, ldv); } kgenAddStmt(ctx, tmp); } static void genShiftPointers( struct KgenContext *ctx, const BlasGenSettings *gset, KernelExtraFlags kflags, bool vectorizedPtrs) { char tmp[1024]; unsigned int flags[3] = {KEXTRA_A_OFF_NOT_ZERO, KEXTRA_BX_OFF_NOT_ZERO, KEXTRA_CY_OFF_NOT_ZERO}; char ptrNames[3] = {'A', 'B', 'C'}; const char *offNames[3] = {"offA", "offB", "offC"}; MatrixRole mroles[3] = {MATRIX_A, MATRIX_B, MATRIX_C}; int i; for (i = 0; i < 3; i++) { if (kflags & flags[i]) { unsigned int vecLen; vecLen = getVecLen(gset, CLBLAS_GEMM, mroles[i]); if( vectorizedPtrs && (vecLen > 1) ) { sprintf(tmp, "%c += %s / %u;\n", ptrNames[i], offNames[i], vecLen); } else { sprintf(tmp, "%c += %s;\n", ptrNames[i], offNames[i]); } kgenAddStmt(ctx, tmp); } } } //----------------------------------------------------------------------------- static void sprintfOffABC( char *str, KernelExtraFlags kflags) { str[0] = '\0'; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { str += sprintf(str, ",\n const uint offA"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { str += sprintf(str, ",\n const uint offB"); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { str += sprintf(str, ",\n const uint offC"); } } static void declareKernel( struct KgenContext *ctx, const BlasGenSettings *gset, const char *nameSuffix) { char tmp[4096]; char offABC[1024]; char fpref; char *tnameA, *tnameB; const char *tnameC; const char *rawType; DataType dtype = gset->kextra->dtype; unsigned int vecLen; const PGranularity *pgran = gset->pgran; fpref = dtypeToBlasPrefix(dtype); rawType = dtypeBuiltinType(dtype); vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_A); getVectorTypeName(dtype, vecLen, (const char **)&tnameA, NULL); vecLen = getVecLen(gset, CLBLAS_GEMM, MATRIX_B); getVectorTypeName(dtype, vecLen, (const char **)&tnameB, NULL); // FIXME - take into account flag BGF_LD_IN_VECTORS //sprintf( tnameC, 
"%s", rawType ); getVectorTypeName( dtype, getVecLen( gset, 0, MATRIX_C ), &tnameC, NULL ); sprintfOffABC(offABC, gset->kextra->flags); sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n" "void __kernel\n" "%cgemm%s(\n" " uint M,\n" " uint N,\n" " uint K,\n" " const %s alpha,\n" " const %s beta,\n" " const __global %s *restrict A,\n" " const __global %s *restrict B,\n" " __global %s *C,\n" " uint lda,\n" " uint ldb,\n" " uint ldc%s)\n", pgran->wgSize[0], pgran->wgSize[1], fpref, nameSuffix, rawType, rawType, tnameA, tnameB, tnameC, offABC); kgenDeclareFunction(ctx, tmp); } //----------------------------------------------------------------------------- static void genHitMatrixCheck( struct KgenContext *ctx, KernelExtraFlags kflags) { /* tails of upper level blocks */ bool tailsM = kflags & KEXTRA_TAILS_M; bool tailsN = kflags & KEXTRA_TAILS_N; if (tailsM) { if (tailsN) { kgenAddStmt(ctx, "if ((coord.y >= M) || (coord.x >= N)) {\n"); } else { kgenAddStmt(ctx, "if (coord.y >= M) {\n"); } } else { if (tailsN) { kgenAddStmt(ctx, "if (coord.x >= N) {\n"); } } if (tailsM || tailsN) { kgenAddStmt(ctx, " return;\n}\n\n"); } } //----------------------------------------------------------------------------- static ssize_t blockGen( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool isRelA, isRelB; bool tailsK = ((kflags & KEXTRA_TAILS_K_LOWER) != 0); DataType dtype = kextra->dtype; char tmp[2048]; bool doubleBased = isDoubleBasedType(dtype); BlasGenSettings gset; KernelVarNames *vnames = &gset.varNames; TileMulOpts mulOpts; ssize_t ret; char globalIdB[64]; const char *alignedK; FetchAddrMode addrMode, addrMask = 0; FetchOpts fopts; TilePostFetchPrivate pfPriv; TailStatus tailStatus; UpdateResultFlags upFlags; unsigned int i; unsigned int vecLen; int isColMajA; int isColMajB; memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&fopts, 0, sizeof(fopts)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.flags = BGF_DISTINCT_VECLEN | BGF_LD_IN_VECTORS; // FIXME: throw the explicit constant away switch (dtype) { case TYPE_FLOAT: // i = 12; i = 16; break; case TYPE_COMPLEX_DOUBLE: i = 6; break; default: i = 8; break; } if (subdims[1].y + subdims[1].x <= i) { gset.flags |= BGF_WHOLE_A; } gset.kextra = kextra; gset.pgran = pgran; //avoid [0].bw loop gset.subdims[0].bwidth = gset.subdims[1].bwidth; mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) && (dtype != TYPE_COMPLEX_FLOAT)) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.fctx = createFetchContext(); if (mulOpts.fctx == NULL) { return -ENOMEM; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { destroyFetchContext(mulOpts.fctx); return -ENOMEM; } isColMajA = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A); isColMajB = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B); alignedK = (tailsK) ? 
"Kbase" : "K"; // setup kernel variables vnames->A = "A"; vnames->B = "B"; vnames->C = "C"; vnames->coordA = "coord.y"; vnames->coordB = "coord.x"; vnames->k = "coord.z"; vnames->sizeK = alignedK; vnames->sizeM = "M"; vnames->sizeN = "N"; vnames->lda = "lda"; vnames->ldb = "ldb"; vnames->ldc = "ldc"; vnames->alpha = "alpha"; vnames->beta = "beta"; // at first, generate needed declarations ret = kgenDeclareUptrs(ctx, doubleBased); declareKernel(ctx, &gset, "Block"); ret = kgenBeginFuncBody(ctx); if (tailsK) { sprintf(tmp, "const uint Ktail = K %% %lu;\n" "const uint Kbase = K - Ktail;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); alignedK = "Kbase"; } else { alignedK = "K"; } initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET); declareTileStorages(ctx, &gset); kgenAddStmt(ctx, "uint4 coord = 0u; /* contains coordB, coordA, k */\n"); kgenAddBlankLine(ctx); vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_A); if (vecLen > 1) { kgenPrintf(ctx, "lda /= %u;\n", vecLen); } vecLen = getVecLen(&gset, CLBLAS_GEMM, MATRIX_B); if (vecLen > 1) { kgenPrintf(ctx, "ldb /= %u;\n", vecLen); } /* * The matrix was made B inner if every thread should accesses their * elements with a large stride but accesses elements of the matrix A * sequentially to provide more coalesced memory accesses. * Otherwise, the matrix A was made inner. */ i = (!isColMajA && isColMajB) ? 1 : 0; tailStatus = checkGenAdjustTailCoords(NULL, CLBLAS_GEMM, &gset, NULL); if (tailStatus & TAIL_A_RAISED) { addrMask |= FETCH_ADDR_A_RELATIVE; } if (tailStatus & TAIL_B_RAISED) { addrMask |= FETCH_ADDR_B_RELATIVE; } enableFetchOptLevels(mulOpts.fctx, FOPTLEV_MERGE_FETCHES); addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask, tailStatus, false); isRelA = ((addrMode & FETCH_ADDR_A_RELATIVE) != 0); isRelB = ((addrMode & FETCH_ADDR_B_RELATIVE) != 0); // Alternative calculate global thead id to eliminate Channel conflicts if (isRelB && isMatrixAccessColMaj(CLBLAS_GEMM, gset.kextra->flags, MATRIX_B)) { sprintf(globalIdB, "get_global_id_%d", 1-i); sprintf(tmp, "uint kif;\n" "uint get_group_id_%d;\n" "uint get_global_id_%d;\n",1-i, 1-i); kgenAddStmt(ctx, tmp); } else { sprintf(globalIdB, "(uint)get_global_id(%d)", 1-i); } if (!(isColMajA || isColMajB)) { size_t tsize; tsize = dtypeSize(dtype); sprintf(tmp, "coord.z = (get_local_id(0) %% 2 * %lu) %% %s;\n", sizeof(cl_float8) / tsize, alignedK); kgenAddStmt(ctx, tmp); /* * Adjust fetch addressing mode. It is used staggered access. That * means there is a starting offset along K and hence addressing * in this dimension should be cycled. */ addrMode &= ~FETCH_ADDR_K_RELATIVE; addrMode |= FETCH_ADDR_K_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode & ~addrMask); } if (isRelA) { genSetupItemPtr(ctx, &gset, MATRIX_A); } if (isRelB) { genSetupItemPtr(ctx, &gset, MATRIX_B); } /* * Setup coordinates and check if they don't exceed matrix */ sprintf(tmp, "\n" "coord.y = %luu * (uint)get_global_id(%d);\n" "coord.x = %luu * (uint)%s;\n", subdims[1].y, i, subdims[1].x, globalIdB); kgenAddStmt(ctx, tmp); genHitMatrixCheck(ctx, kflags); genShiftPointers(ctx, &gset, kflags, true); genZeroTile(ctx, &gset.tileCY); tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_GEMM, &gset, NULL); mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? 
TILEMUL_MAD : TILEMUL_MULADD; mulOpts.flags |= TILEMUL_EXTERN_RDECL; mulOpts.flags |= kextraToTilemulFlags(CLBLAS_GEMM, kflags); sprintf(tmp, "for (uint k1 = 0; k1 < %s; k1 += %lu)", alignedK, subdims[1].bwidth); prepareFetchLoop(ctx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY, CLMEM_GLOBAL_MEMORY); kgenBeginBranch(ctx, tmp); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { goto out; } kgenEndBranch(ctx, NULL); // 0..K loop kgenAddBlankLine(ctx); //Optionally handle tails along K if (tailsK) { setDefaultFetchAddrMode(mulOpts.fctx, &gset, addrMask, tailStatus, true); vnames->sizeK = "K"; pfPriv.fetchNumA = 0; pfPriv.wholeA = 0; pfPriv.funcID = CLBLAS_GEMM; pfPriv.gset = &gset; mulOpts.postFetch = defaultTilePostFetch; mulOpts.postFetchPriv = &pfPriv; if (!(isColMajA || isColMajB)) { kgenAddStmt(ctx, "coord.z = Kbase;\n"); } sprintf(tmp, "for (uint k1 = 0u; k1 < Ktail; k1 += %luu)", subdims[1].bwidth); kgenBeginBranch(ctx, tmp); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { goto out; } kgenEndBranch(ctx, NULL); // 0..Ktail loop kgenAddBlankLine(ctx); } gset.kextra = kextra; checkGenRestoreTailCoords(ctx, &gset, tailStatus); upFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); upFlags |= UPRES_INDEXING_WITH_CONSTANTS; genResultUpdateWithFlags(ctx, CLBLAS_GEMM, &gset, upFlags, NULL, NULL, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } out: destroyFetchContext(mulOpts.fctx); destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } //----------------------------------------------------------------------------- /* the generator for subgroup access pattern (used when A and B matrices are accessed row-major)*/ static ssize_t subgGen( char *pBuf, size_t buflen, const struct SubproblemDim *pSubDims, const struct PGranularity *pPGran, void *pExtra ) { struct KgenContext *pCtx; CLBLASKernExtra *pKExtra = (CLBLASKernExtra*)pExtra; KernelExtraFlags kflags = pKExtra->flags; DataType dtype = pKExtra->dtype; size_t staggered = ((extraData_t*)&pKExtra->solverPriv)->staggered; char tmp[2048]; BlasGenSettings gset; TileMulOpts mulOpts; ssize_t ret; FetchOpts fopts; TilePostFetchPrivate pfPriv; UpdateResultFlags upResFlags = 0; TailStatus tailStatus; FetchAddrMode addrMode; Kstring exprK; SubgVarNames subVNames; KernelVarNames *vnames = NULL; const char *alignedK; unsigned int vecLenA; bool isDoubleBased = isDoubleBasedType(dtype); bool tailsLowerK = ( (kflags & KEXTRA_TAILS_K_LOWER) != 0 ); bool tailsM = ( (kflags & KEXTRA_TAILS_M) != 0 ); bool tailsN = ( (kflags & KEXTRA_TAILS_N) != 0 ); bool tailsLowerM = ( (kflags & KEXTRA_TAILS_M_LOWER) != 0 ); bool tailsLowerN = ( (kflags & KEXTRA_TAILS_N_LOWER) != 0 ); unsigned int subgroupsA = 0; unsigned int subgroupsB = 0; memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&fopts, 0, sizeof(fopts)); memcpy( gset.subdims, pSubDims, sizeof(gset.subdims) ); gset.pgran = pPGran; gset.flags = BGF_DISTINCT_VECLEN | BGF_WHOLE_A | BGF_LD_IN_VECTORS; gset.kextra = pKExtra; vnames = &gset.varNames; // setting the basic names for kernel variables vnames->A = "A"; vnames->B = "B"; vnames->C = "C"; vnames->LDS = "scratch"; vnames->sizeM = "M"; vnames->sizeN = "N"; vnames->lda = "lda"; vnames->ldb = "ldb"; vnames->ldc = "ldc"; vnames->alpha = "alpha"; vnames->beta = "beta"; vnames->vectCoordA = "vca"; vnames->vectCoordB = "vcb"; vnames->k = exprK.buf; subgroupsA = (unsigned 
int)(gset.subdims[0].y/gset.subdims[1].y); subgroupsB = (unsigned int)(gset.subdims[0].x/gset.subdims[1].x); initDefaultTiles(&gset, CLBLAS_GEMM, 0, PRIV_STORAGE_VARIABLE_SET); vecLenA = gset.tileA.vecLen; // channel offset based coordinate ksprintf(&exprK, "( (uint)(get_group_id(0))*%lu + k )", staggered/vecLenA*vecLenA); // starting code generation-------------------------------------------------- pCtx = createKgenContext(pBuf, buflen, true); if ( pCtx == NULL) { return -ENOMEM; } //define required macros /* B_BLK_H should be one of common vector sizes, as matrix C is accessed by vectors of this length*/ sprintf(tmp,"#define A_BLK_H %lu\n",gset.subdims[1].y); kgenAddStmt(pCtx,tmp); sprintf(tmp,"#define B_BLK_H %lu\n",gset.subdims[1].x); kgenAddStmt(pCtx,tmp); sprintf(tmp,"#define SUBG_ITEMS %d\n",pPGran->wgSize[0]); kgenAddStmt(pCtx,tmp); sprintf(tmp,"#define SUBG_A %d\n",subgroupsA); kgenAddStmt(pCtx,tmp); sprintf(tmp,"#define SUBG_B %d\n",subgroupsB); kgenAddStmt(pCtx,tmp); kgenAddBlankLine(pCtx); kgenAddStmt(pCtx,tmp); sprintf( tmp, "#define K_VLEN_A %u\n" "#define K_VLEN_B %u\n", getVecLen(&gset, CLBLAS_GEMM, MATRIX_A), getVecLen(&gset, CLBLAS_GEMM, MATRIX_B)); kgenAddStmt(pCtx,tmp); kgenAddBlankLine(pCtx); // Declare pointer unions kgenDeclareUptrs(pCtx, isDoubleBased); kgenAddBlankLine(pCtx); // declaring kernel function declareKernel( pCtx, &gset, "Subgroup" ); ret = kgenBeginFuncBody( pCtx ); // kernel generation steps: // register variables declarations----------------------------------------- // K tail // if postfetch should be engaged, generate tail code for // whole subgroup, otherwise tail is handled by main cycle. if( tailsLowerK ){ sprintf(tmp, "uint Ktail = K %% %lu;\n" "uint Kbase = K - Ktail;\n", pSubDims[0].bwidth); kgenAddStmt(pCtx, tmp); alignedK = "Kbase"; } else { alignedK = "K"; } vnames->sizeK = alignedK; declareTileStorages(pCtx, &gset); // scaling leading dims // If lower-K tails need to be handled, vectorized access is disabled // scaling is performed by factor 1 sprintf(tmp, "%s /= K_VLEN_A;\n", vnames->lda); kgenAddStmt(pCtx, tmp); sprintf(tmp, "%s /= K_VLEN_B;\n", vnames->ldb); kgenAddStmt(pCtx, tmp); //declare variables for subgroup mode subVNames.itemId = "itemId"; kgenAddBlankLine( pCtx ); kgenPrintf( pCtx, "int2 %s;\n", subVNames.itemId ); // item id kgenPrintf( pCtx, "%s.x = get_local_id(0);\n", subVNames.itemId ); // subgroup id kgenPrintf( pCtx, "%s.y = get_local_id(1);\n", subVNames.itemId ); kgenAddBlankLine( pCtx ); // coordinate variables vnames->coordA = "coordY"; vnames->coordB = "coordX"; // generate offsets genShiftPointers( pCtx, &gset, kflags, true ); // FIXME add new subgroup variables support sprintf(tmp, "int %s = " "A_BLK_H*( " "get_group_id(1)*SUBG_A + " "get_local_id(1)/SUBG_B );\n", vnames->coordA); kgenAddStmt(pCtx, tmp); sprintf(tmp, "int %s = " "B_BLK_H*( " "get_group_id(0)*SUBG_B + " "get_local_id(1)%%SUBG_B );\n", vnames->coordB); kgenAddStmt(pCtx, tmp); kgenAddBlankLine(pCtx); // Block M N tails. 
Drop excess blocks ------------------------------------ kgenAddStmt(pCtx,"uint skipTileMul = 0;\n"); //M if( tailsM ){ kgenAddStmt(pCtx,"//M block tail\n"); sprintf(tmp, "if( %s >= %s )", vnames->coordA, vnames->sizeM); kgenBeginBranch( pCtx,tmp ); kgenAddStmt(pCtx,"skipTileMul = 1;\n"); kgenEndBranch(pCtx,NULL); } //N if( tailsN ){ kgenAddStmt(pCtx,"//N block tail\n"); sprintf(tmp, "if( %s >= %s )", vnames->coordB, vnames->sizeN); kgenBeginBranch( pCtx,tmp ); kgenAddStmt(pCtx,"skipTileMul = 1;\n"); kgenEndBranch(pCtx,NULL); } kgenAddBlankLine(pCtx); //"Lower" tails if( tailsLowerM || tailsLowerN ){ kgenAddStmt(pCtx, "//Raising \"Lower\" M N tails\n"); } tailStatus = checkGenAdjustTailCoords(pCtx, CLBLAS_GEMM, &gset, NULL); // A, B pointers----------------------------------------------------------- sprintf(tmp, "A += %s*%s;\n", vnames->lda, vnames->coordA); kgenAddStmt(pCtx, tmp); sprintf(tmp, "B += %s*%s;\n", vnames->ldb, vnames->coordB); kgenAddStmt(pCtx, tmp); // calculated in vectors, C access is aligned to. // if row of C-block is splitted into smaller vectors - // multiply offset by number of these vectors kgenAddBlankLine(pCtx); genZeroTile( pCtx, &gset.tileCY ); kgenAddBlankLine(pCtx); kgenAddBlankLine(pCtx); mulOpts.fctx = createFetchContext(); if (mulOpts.fctx == NULL) { destroyKgenContext(pCtx); return -ENOMEM; } enableFetchOptLevels(mulOpts.fctx, FOPTLEV_CAN_SHARE_TMP_AB); addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, FETCH_ADDR_K_RELATIVE, tailStatus, false); addrMode |= FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE | FETCH_ADDR_K_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); prepareFetchLoop(pCtx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY, CLMEM_GLOBAL_MEMORY); if( tailsM || tailsN ){ kgenBeginBranch(pCtx,"if( !skipTileMul )"); } sprintf(tmp, "for(int k = %u*get_local_id(0); k < %s; k += %u*SUBG_ITEMS)", vecLenA, alignedK, vecLenA); kgenBeginBranch( pCtx, tmp ); // tiles multiplier-------------------------------------------------------- mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? 
TILEMUL_MAD : TILEMUL_MULADD; mulOpts.flags = kextraToTilemulFlags( CLBLAS_GEMM, kflags ); mulOpts.flags |= TILEMUL_EXTERN_RDECL; mulOpts.flags |= TILEMUL_NOT_INC_K; mulOpts.flags |= TILEMUL_BW_STRIDE; /* both matrices are accessed row - major */ mulOpts.flags |= TILEMUL_TRB; ret = tileMulGen( pCtx, &gset, &mulOpts ); if (ret != 0) { goto out; } kgenEndBranch(pCtx, NULL); kgenAddBlankLine(pCtx); // K - Tail if ( tailsLowerK ) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_TAILK_PADD); vnames->sizeK = "K"; vnames->k = "k"; kgenPrintf(pCtx, "uint %s = %s + get_local_id(0)*%u;\n", vnames->k, alignedK, vecLenA); pfPriv.fetchNumA = 0; pfPriv.wholeA = 0; pfPriv.funcID = CLBLAS_GEMM; pfPriv.gset = &gset; mulOpts.postFetch = defaultTilePostFetch; mulOpts.postFetchPriv = &pfPriv; kgenBeginBranch(pCtx, NULL); ret = tileMulGen(pCtx, &gset, &mulOpts); if (ret != 0) { goto out; } kgenEndBranch(pCtx, NULL); } if( tailsM || tailsN ){ kgenEndBranch(pCtx, NULL); // skip tilemul condition } kgenAddBlankLine(pCtx); upResFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags) | tailStatusToUpresFlags(tailStatus); // restore coordinates, if tail was raised checkGenRestoreTailCoords(pCtx, &gset, tailStatus); // merge and update result mergeUpdateResult( pCtx, CLBLAS_GEMM, &gset, &subVNames, upResFlags | UPRES_EXCEED_PROBLEM_CONDITION | UPRES_INDEXING_WITH_CONSTANTS, (UpresProcPtr)genResultUpdateWithFlags ); kgenEndFuncBody(pCtx); if (!ret) { ret = (ssize_t)kgenSourceSize(pCtx) + 1; } out: destroyFetchContext(mulOpts.fctx); destroyKgenContext(pCtx); return (ret < 0) ? -EOVERFLOW : ret; } //----------------------------------------------------------------------------- static void assignBlockKargs(KernelArg *args, const void *params, const void *extra) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx; (void)extra; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); initSizeKarg(&args[2], blasArgs->K); assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[4], &(blasArgs->beta), blasArgs->dtype); INIT_KARG(&args[5], blasArgs->A); INIT_KARG(&args[6], blasArgs->B); INIT_KARG(&args[7], blasArgs->C); initSizeKarg(&args[8], blasArgs->lda.matrix); initSizeKarg(&args[9], blasArgs->ldb.matrix); initSizeKarg(&args[10], blasArgs->ldc.matrix); idx = 11; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offBX); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offCY); } } static bool blockCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { bool ret = true; bool ret_multiple = false; int i; DUMMY_ARG_USAGE(subdimsNum); if (check == PGRAN_CHECK) { unsigned int minSize, maxSize; maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8; minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 
1 : 2; ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true); ret = ret && (subdims[0].bwidth == subdims[1].bwidth); for(i = 0; i < ( (pgran->maxWorkGroupSize) / (pgran->wfSize) ); i++) { // returns true if wgSize[0] * wgSize[1] is multiples of the 64 but not bigger than maxWorkGroupSize ret_multiple = ret_multiple || ( pgran->wgSize[0] * pgran->wgSize[1] == pgran->wfSize * (i + 1) ); } ret = ret && ret_multiple; } else { calcPgranDedicated(pgran, subdims, 1, 3); } return ret; } //----------------------------------------------------------------------------- static void assignSubgKargs(KernelArg *args, const void *params, const void *extra) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx = 0; (void)extra; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); initSizeKarg(&args[2], blasArgs->K); assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[4], &(blasArgs->beta), blasArgs->dtype); INIT_KARG(&args[5], blasArgs->A); INIT_KARG(&args[6], blasArgs->B); INIT_KARG(&args[7], blasArgs->C); initSizeKarg(&args[8], blasArgs->lda.matrix); initSizeKarg(&args[9], blasArgs->ldb.matrix); initSizeKarg(&args[10], blasArgs->ldc.matrix); idx = 11; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offBX); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offCY); } return; } //----------------------------------------------------------------------------- static DecompositionAxis innerDecompositionAxis(const void *args) { const CLBlasKargs *kargs = args; int tra, trb; tra = (kargs->order == clblasColumnMajor) ^ (kargs->transA != clblasNoTrans); trb = (kargs->order == clblasRowMajor) ^ (kargs->transB != clblasNoTrans); /* * Make the matrix B inner if every thread should access their elements * with a large stride but accesses elements of the matrix A sequentially * to provide more coalesced memory accesses. */ return (!tra && trb) ? DECOMP_AXIS_X : DECOMP_AXIS_Y; } //----------------------------------------------------------------------------- static SolverFlags solverFlags(void) { return (SF_WSPACE_2D); } //----------------------------------------------------------------------------- static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBlasKargs *kargs = (CLBlasKargs*)args; extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv; const size_t nChans = 8; // !!!DEVICE DEPENDED!!! const size_t wideChans = 64; // !!!DEVICE DEPENDED!!! 
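    /*
     * Assumed rationale (inferred from the computation below and from the
     * "eliminate Channel Conflicts" comments earlier in this file): when K
     * is an exact multiple of wideChans * nChans / sizeType[dtype],
     * successive work groups would start their K walks on the same memory
     * channel, so a non-zero staggered offset is recorded in
     * extraData->staggered to spread those accesses; otherwise staggering
     * is disabled.
     */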
const size_t sizeType[] = {1,2,2,4}; size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype]; size_t off = kargs->K % sizeBlock; if (off == 0) { extraData->staggered = roundUp(subdims[1].bwidth * sizeType[kargs->dtype] , wideChans / sizeType[kargs->dtype]); } else { extraData->staggered = 0; } } //----------------------------------------------------------------------------- void InitGEMMCachedBlockPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block gemm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &blockSOps; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- static int blockGetPerf( unsigned int kflags, const void *args) { (void)args; if( !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_B ) ){ return PPERF_AVERAGE; } return PPERF_GOOD; } //----------------------------------------------------------------------------- void InitGEMMCachedSubgroupPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based subgroup gemm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &subgSOps; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- static int gemmSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs ) { DUMMY_ARG_USAGE(subdimsNum); pgran->wgDim = 2; return subgGetDefaultDecomp( pgran, subdims, pArgs ); } //----------------------------------------------------------------------------- static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { unsigned int subgroupsA = 0; unsigned int subgroupsB = 0; unsigned int itemsPerSubg = 0; unsigned int regUse = 0; //EINVAL if( (subdimsNum<2)|| (NULL==pgran)|| (NULL==subdims) ){ return false; } if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } if( subdims[1].x != subdims[1].itemX || subdims[1].y != subdims[1].itemY ){ return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].x || subdims[0].y % subdims[1].y || subdims[0].bwidth % subdims[1].bwidth ){ return false; } if( !(isDoubleBasedType(dtype) && isComplexType(dtype) ) ){ if ( subdims[1].x < 2 || subdims[1].y < 2 || subdims[1].bwidth < 2 ) { return false; } } // check dimensions if( subdims[1].bwidth > 8 || subdims[1].x > 8 || subdims[1].y > 8 ){ return false; } // estimate register usage, drop // inevitably slowed decompositions regUse = ( subdims[1].bwidth * subdims[1].x + subdims[1].bwidth * subdims[1].y + subdims[1].x * subdims[1].y ) * dtypeSize(dtype); regUse /= 16; // 16 bytes per register if( regUse >= 50 ){ return false; } // validate the subgroup decomposition itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth; subgroupsA = subdims[0].y/subdims[1].y; subgroupsB = subdims[0].x/subdims[1].x; // passed PGranularity should be checked if( PGRAN_CHECK == check ){ if( pgran->wgSize[0] != itemsPerSubg || pgran->wgSize[1] != subgroupsA*subgroupsB ){ return false; } 
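        /*
         * Illustrative decomposition (hypothetical values, not a tuned
         * configuration) that passes the checks in this function for
         * TYPE_FLOAT: subdims[0] = {x:32, y:32, bwidth:16} and
         * subdims[1] = {x:8, y:8, bwidth:4} give itemsPerSubg = 4 and
         * subgroupsA = subgroupsB = 4, so the expected work group size is
         * {4, 16}, i.e. 4 * 16 = 64 items, and the register estimate is
         * (4*8 + 4*8 + 8*8) * 4 / 16 = 32, below the limit of 50.
         */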
//filter subgroup numbers with poor performance //(less than 2 items in subgroup) if( pgran->wgSize[0] < 2 ){ return false; } // drop groups consisting of number of items other than 64 if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){ return false; } } // PGranularity should be calculated else{ pgran->wgSize[0] = itemsPerSubg; pgran->wgSize[1] = subgroupsA*subgroupsB; } pgran->wgDim = 2; /*Debug out for Tune*/ return true; } //----------------------------------------------------------------------------- static void subgCalcGlobalThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra ) { CLBlasKargs *pArgs; //EINVAL if( NULL == subdims || NULL == pgran || NULL == args || NULL == extra) { return; } pArgs = (CLBlasKargs*)args; threads[0] = (pArgs->N/subdims[0].x)*pgran->wgSize[0]; threads[1] = (pArgs->M/subdims[0].y)*pgran->wgSize[1]; // N tail group if( pArgs->N%subdims[0].x ){ threads[0] += pgran->wgSize[0]; } // M tail group if( pArgs->M%subdims[0].y ){ threads[1] += pgran->wgSize[1]; } } //----------------------------------------------------------------------------- static int subgGetPerf( unsigned int kflags, const void *args) { DUMMY_ARG_USAGE(args); if( !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_GEMM, kflags, MATRIX_B ) ){ return PPERF_GOOD; } return PPERF_NOT_SUPPORTED; } clblas-2.10/src/library/blas/gens/gemm_cached.cpp000066400000000000000000000337211264277366700217560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Cached global buffers based gemm generator */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "tuned_numbers.h" //#define DEBUG_GEMM_2 static CLBLASMpatExtra mpatExtra; static char Prefix[4]; /* Function, finding default decomposition */ static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static void setBuildOpts( char * buildOptStr, const void *args); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static SolverOps gemmSops = { generator, assignKargs, NULL, NULL, NULL, calcNrThreads, NULL, solverFlags, NULL, getDefaultDecomposition, NULL, setBuildOpts, NULL }; static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { const CLBlasKargs *kargs = (const CLBlasKargs *)args; //const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra; //KernelExtraFlags kflags = kextra->flags; size_t M, N; M = kargs->M; N = kargs->N; threads[1] = 1; if ((subdims->x != SUBDIM_UNUSED) && (subdims->y != SUBDIM_UNUSED)) { size_t groupWorkX, groupWorkY; size_t nrGroupsX, nrGroupsY; int nrDims; groupWorkX = subdims->x; groupWorkY = subdims->y; nrGroupsX = N / groupWorkX; if (N % groupWorkX) { nrGroupsX++; } nrGroupsY = M / groupWorkY; if (M % groupWorkY) { nrGroupsY++; } nrDims = (pgran == NULL) ? 
1 : pgran->wgDim; threads[0] = nrGroupsX * nrGroupsY; if(kargs->pigFuncID == CLBLAS_HERK) { threads[0] = (nrGroupsY * (nrGroupsY + 1)) / 2; } } if (pgran != NULL) { threads[0] *= pgran->wgSize[0]; threads[1] *= pgran->wgSize[1]; } } static void setBuildOpts( char * buildOptStr, const void *args) { SolutionStep *step = (SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); const SubproblemDim *dims = step->subdims; //size_t vecLen = sizeof(cl_float4)/dtypeSize(kargs->dtype); KernelExtraFlags kflags = step->extraFlags; blockSizes bestSize = bestBlockSizeForDevice( step ); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } if (isComplexType(kargs->dtype)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX"); } if ((bestSize.useBarrier) == 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGEMM_NEEDS_BARRIER"); } if (kargs->M % dims->y) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DM_TAIL_PRESENT"); } if (kargs->N % dims->x) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DN_TAIL_PRESENT"); } if (kflags & KEXTRA_CONJUGATE_A) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A"); } if (kflags & KEXTRA_CONJUGATE_B) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B"); } switch(kargs->pigFuncID) { case CLBLAS_HEMM: case CLBLAS_SYMM: case CLBLAS_SYMM_DIAGONAL: case CLBLAS_HEMM_DIAGONAL: #ifdef DEBUG_GEMM_2 printf("GEMM2: setBuildOpts: Setting options for SYMM\n"); #endif if (kargs->side == clblasLeft) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__"); } if (kargs->side == clblasRight) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__"); } if (kargs->uplo == clblasLower) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__"); } if (kargs->uplo == clblasUpper) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__"); } // Define the order for Legacy sake. if (kargs->order == clblasColumnMajor) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__"); } else { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__"); } if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__"); } if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__"); } break; case CLBLAS_HERK: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK"); if(kargs->uplo == clblasLower) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE"); } else if(kargs->uplo == clblasUpper) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE"); } break; default: break; } #ifdef DEBUG_GEMM_2 printf("buildStr: %s\n", buildOptStr); #endif return; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; char tempTemplate[64*1024]; //PENDING: Is it safe to have 64K in stack for threadSafety? 
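    /*
     * tempTemplate collects the raw kernel template (the SYMM/HEMM and GEMM
     * helpers plus the NN/NT/TN kernel body chosen further down);
     * kobj.spit() then expands the %WIDTH/%ITEMX/%ITEMY/%PANEL placeholders
     * from it into buf. The 64 KiB size matches the buffer length reported
     * when buf == NULL.
     */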
char itemx[10], itemy[10], width[10], itemy_by_width[10], itemx_by_width[10]; char bwidth[10], panel_by_v[10]; size_t Y, X, BLOCKSIZE, ITEMX, ITEMY; bool doVLOAD = false; unsigned int veclen; if (buf == NULL) { buflen = 64*1024*sizeof(char); return (ssize_t)buflen; } // // PENDING: Add Support for Row Major // if ((kflags & KEXTRA_COLUMN_MAJOR) == 0) { return 0; } if ((kflags & KEXTRA_NO_COPY_VEC_A) || (kflags & KEXTRA_NO_COPY_VEC_B) || (kflags & KEXTRA_NO_COPY_VEC_C)) { #ifdef DEBUG_GEMM_2 printf("GEMM2: Doing un-aligned access\n"); #endif doVLOAD= true; } else { #ifdef DEBUG_GEMM_2 printf("GEMM2: Doing Aligned access\n"); #endif } BLOCKSIZE = pgran->wgSize[0]; #ifdef DEBUG_GEMM_2 printf("GEMM2- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %d \n", BLOCKSIZE, subdims->y, subdims->x, kextra->vecLen); #endif veclen = kextra->vecLen; ITEMY = subdims->itemY; ITEMX = subdims->itemX; Y = subdims->y / ITEMY; X = subdims->x / ITEMX; // // Handle in-compatible subdims and workgroup sizes // We will use "veclen" of 1 as our shield against these in-compatible // geometries. // if ( (ITEMY % kextra->vecLen) || ((ITEMX % kextra->vecLen) && (kflags & KEXTRA_TRANS_B)) ) { // // FIXME: // This kernel must be stored against vecLen of 1 in Kernel Cache. // This needs change in EXTRA structure. However, this is against the API. // We are going against the API by changing fields in EXTRA structure. // One alternate FIX is to return an error. // kextra->vecLen = kextra->vecLenA = kextra->vecLenB = kextra->vecLenC = 1; doVLOAD = true; veclen = 1; } // // PENDING: Selective Vectorization for A, B and C access has to be added // in KPRINTF module (VLOADA, VLOADB, VLOADC, VSTOREC) // kprintf kobj(Prefix[dtype], veclen, doVLOAD, doVLOAD); // Only Vectored Access sprintf(width, "%lu", Y); sprintf(itemy, "%lu", ITEMY); sprintf(itemx, "%lu", ITEMX); sprintf(itemy_by_width, "%lu", (size_t) ITEMY/veclen); sprintf(itemx_by_width, "%lu", (size_t) ITEMX/veclen); //sprintf(bwidth, "%lu", subdims->bwidth); //sprintf(panel_by_v, "%lu", (subdims->bwidth / veclen)); sprintf(bwidth, "%lu", (size_t) veclen); sprintf(panel_by_v, "%lu", (size_t) 1); kobj.put("%WIDTH", width); kobj.put("%ITEMX", itemx); kobj.put("%ITEMY", itemy); kobj.put("%ITEMY_BY_V", itemy_by_width); kobj.put("%ITEMX_BY_V", itemx_by_width); kobj.put("%PANEL", bwidth); kobj.put("%PANEL_BY_V", panel_by_v); #ifdef DEBUG_GEMM_2 printf("ColMajor GEMM - WIDTH = %s, PANEL = %lu, ITEMX = %s, ITEMY = %s, Veclen = %lu\n", width, subdims->bwidth, itemx, itemy, veclen); #endif strcpy(tempTemplate, SYMM_HEMM_HELPER); if ((kflags & KEXTRA_TRANS_A) == 0) { if (kflags & KEXTRA_TRANS_B) { #ifdef DEBUG_GEMM_2 printf("Using GEMM_NT_KERNEL\n"); #endif strcat(tempTemplate, GEMM_HELPER); strcat(tempTemplate, GEMM_NT_KERNEL); } else { #ifdef DEBUG_GEMM_2 printf("Using GEMM_NN_KERNEL\n"); #endif strcat(tempTemplate, GEMM_HELPER); strcat(tempTemplate, GEMM_NN_KERNEL); } } else { // PENDING: if (kflags & KEXTRA_TRANS_B) { tempTemplate[0] = 0; } else { #ifdef DEBUG_GEMM_2 printf("Using GEMM_TN_KERNEL\n"); #endif strcat(tempTemplate, GEMM_HELPER); strcat(tempTemplate, GEMM_TN_KERNEL); } } kobj.spit(buf, tempTemplate); #ifdef DEBUG_GEMM_KPRINTF printf("Kernel = \n%s\n", buf); #endif size_t tail = strlen(buf) + 1; while(tail < 64*1024) { buf[tail++] = 0; } return 64*1024*sizeof(char); } static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; #ifdef DEBUG_GEMM_2 printf("SAlpha=%f, DAlpha=%f, CAlpha 
=<%f, %f>, DAlpha=<%f, %f>\n", blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex), CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex)); printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n", blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex), CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex)); #endif INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument INIT_KARG(&args[2], blasArgs->C); //y - scratch == _x_vector argument initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); initSizeKarg(&args[5], blasArgs->K); initSizeKarg(&args[6], blasArgs->lda.matrix); initSizeKarg(&args[7], blasArgs->ldb.matrix); initSizeKarg(&args[8], blasArgs->ldc.matrix); initSizeKarg(&args[9], blasArgs->offA); initSizeKarg(&args[10], blasArgs->offBX); initSizeKarg(&args[11], blasArgs->offCY); assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[13], &(blasArgs->beta), blasArgs->dtype); return; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } extern "C" void initGemmV2CachedPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block gemm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &gemmSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs) { DUMMY_ARG_USAGE(pArgs); // // FIXME: container_of() - Counts on the fact that "getDefaultDecomposition" is called // with step->pgran, step->subdims // SolutionStep *step = container_of( pgran , pgran, SolutionStep); blockSizes bestSize = bestBlockSizeForDevice( step ); pgran->wgSize[0] = bestSize.TY * bestSize.TX; pgran->wgSize[1] = 1; pgran->wgDim = 1; if (subdimsNum >= 1) { subdims[0].y = bestSize.TY * bestSize.ITEMY; subdims[0].x = bestSize.TX * bestSize.ITEMX; subdims[0].itemY = bestSize.ITEMY; subdims[0].itemX = bestSize.ITEMX; subdims[0].bwidth = 4; } if (subdimsNum >= 2) { subdims[1].y = bestSize.TY * bestSize.ITEMY; subdims[1].x = bestSize.TX * bestSize.ITEMX; subdims[1].itemY = bestSize.ITEMY; subdims[1].itemX = bestSize.ITEMX; subdims[1].bwidth = 4; } return 0; } clblas-2.10/src/library/blas/gens/gemm_tail_cached.cpp000066400000000000000000000322701264277366700227650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Cached global buffers based gemm generator */ #include #include #include #include #include #include #include #include #include #include #include #include extern "C" int gemmHasNTail(size_t N, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB); extern "C" int gemmHasMTail(size_t M, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB); //#define DEBUG_GEMM_TAIL static CLBLASMpatExtra mpatExtra; static char Prefix[4]; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static void setBuildOpts( char * buildOptStr, const void *args); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static SolverOps gemmSops = { generator, assignKargs, NULL, NULL, NULL, calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); KernelExtraFlags kflags = step->extraFlags; addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT"); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_GEMM_TAIL printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if (isComplexType(kargs->dtype)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX"); } if (kflags & KEXTRA_CONJUGATE_A) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A"); } if (kflags & KEXTRA_CONJUGATE_B) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B"); } switch(kargs->pigFuncID) { case CLBLAS_GEMM2: case CLBLAS_GEMM_TAIL: break; case CLBLAS_HERK: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK"); if(kargs->uplo == clblasLower) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE"); } else if(kargs->uplo == clblasUpper) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE"); } break; case CLBLAS_HEMM: case CLBLAS_SYMM_DIAGONAL: case CLBLAS_HEMM_DIAGONAL: case CLBLAS_SYMM: #ifdef DEBUG_GEMM_2 printf("GEMM2: setBuildOpts: Setting options for SYMM\n"); #endif if (kargs->side == clblasLeft) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__"); } if (kargs->side == clblasRight) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__"); } if (kargs->uplo == clblasLower) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__"); } if (kargs->uplo == clblasUpper) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__"); } // Define the order for Legacy sake. 
if (kargs->order == clblasColumnMajor) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__"); } else { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__"); } if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__"); } if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__"); } break; default: printf("GEMM TAIL: Unknown pigFuncID\n"); break; } #ifdef DEBUG_GEMM_TAIL printf("GEMMTAIL: Build options = %s\n", buildOptStr); #endif } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { int BLOCKSIZE = pgran->wgSize[0]; // 1D Block size_t tailM, tailN, M, N; size_t Y, X; size_t nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsA; size_t nWorkGroupsBY, nWorkGroupsBX, nWorkGroupsB; size_t totalWorkGroups; #ifdef DEBUG_GEMM_TAIL printf("calcNrThreads called from gemm_tail.cpp\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *kextra = ( CLBLASKernExtra *)extra; KernelExtraFlags kflags = kextra->flags; // // RowMajor GEMM can be expressed in terms of Column Major GEMM // if ((kflags & KEXTRA_COLUMN_MAJOR) == 0) { printf("calcNrThreads: FIXME: RowMajor is NOT supported \n"); return; } if (kextra->vecLenA != 1) { printf("GEMM_TAIL: calcNrThreads(): Vector Length must be 1 for TAIL. Non-one Vector Length Requested\n"); return; } tailM = kargs->tailStartM; tailN = kargs->tailStartN; M = kargs->M; N = kargs->N; Y = 8; if (Y != subdims->y) { Y = subdims->y; } X = BLOCKSIZE/Y; /* LEGACY CODE: Outdated now. TAIL can handle this condition now using MTAIL_PRESENT and NTAIL_PRESENT if (tailN % X) { printf("GEMM_TAIL: calcNrThreads(): WARNING: tailN is not divisible by X. 
Will produce Wrong results!\n"); } */ // // A Tail Workgroup will process YxX panel // /* ______________ | | | | | | | | | B Tail panel |___________| | |___________|__| <--- A --> */ if(tailM != M) { #ifdef DEBUG_GEMM_TAIL printf("GEMM_TAIL: M has TAIL\n"); #endif nWorkGroupsAY = ((M - tailM -1)/Y + 1); nWorkGroupsAX = ((tailN - 1)/X + 1); nWorkGroupsA = nWorkGroupsAY * nWorkGroupsAX; } else { nWorkGroupsA = 0; } if (tailN != N) { #ifdef DEBUG_GEMM_TAIL printf("GEMM_TAIL: N has TAIL\n"); #endif nWorkGroupsBY = ((M-1)/Y) + 1; nWorkGroupsBX = ((N-tailN-1)/X) + 1; nWorkGroupsB = nWorkGroupsBY * nWorkGroupsBX; } else { nWorkGroupsB = 0; } totalWorkGroups = nWorkGroupsA + nWorkGroupsB; threads[0] = totalWorkGroups * BLOCKSIZE; threads[1] = 1; #ifdef DEBUG_GEMM_TAIL printf("GEMM_TAIL: calcNrThreads(): vlen:%d, , , nWorkGroupsB<%lu,%lu>\n", kextra->vecLenA, tailM, M, tailN, N, nWorkGroupsAY, nWorkGroupsAX, nWorkGroupsBY, nWorkGroupsBX); printf("GEMM_TAIL: calcNrThreads(): globalThreads0=%lu, globalThreads1=%lu\n", threads[0], threads[1]); #endif return; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; char tempTemplate[32*1024]; char itemx[10], itemy[10], width[10], itemy_by_width[10], itemx_by_width[10]; size_t Y, X, BLOCKSIZE, ITEMX, ITEMY; if (buf == NULL) { buflen = 32*1024*sizeof(char); return (ssize_t)buflen; } // // PENDING: Add Support for Row Major at the xAPI.c level // Row major calcs can be expressed in terms of column major // if ((kflags & KEXTRA_COLUMN_MAJOR) == 0) { return 0; } kprintf kobj(Prefix[dtype], 1, false, false); // Only Scalar Access BLOCKSIZE = pgran->wgSize[0]; #ifdef DEBUG_GEMM_TAIL printf("GEMM- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %d \n", BLOCKSIZE, subdims->y, subdims->x, kextra->vecLenA); #endif Y = 8; if (Y != subdims->y) { //printf("GEMM_TAIL: generator(): WARNING: subdims->y is un-suitable.\n"); Y = subdims->y; } X = BLOCKSIZE/Y; ITEMY = (subdims->y) / Y; ITEMX = (subdims->x) / X; if (ITEMX == 0) { ITEMX = 1; } if ((BLOCKSIZE % Y) || ((subdims->y) % Y) || ((subdims->x)%X) || (ITEMY % kextra->vecLenA) || ((X*ITEMX) % kextra->vecLenA)) { printf("WARNING: GEMM TAIL - generator: subdim and blocksize in-compatible. This code should never execute!\n"); } sprintf(width, "%lu", Y); sprintf(itemy, "%lu", ITEMY); sprintf(itemx, "%lu", ITEMX); sprintf(itemy_by_width, "%lu", (size_t) ITEMY/kextra->vecLenA); sprintf(itemx_by_width, "%lu", (size_t) ITEMX/kextra->vecLenA); kobj.put("%WIDTH", width); kobj.put("%ITEMX", itemx); kobj.put("%ITEMY", itemy); kobj.put("%ITEMY_BY_V", itemy_by_width); kobj.put("%ITEMX_BY_V", itemx_by_width); kobj.put("%PANEL", "1"); kobj.put("%PANEL_BY_V", "1"); #ifdef DEBUG_GEMM_TAIL printf("ColMajor GEMM - WIDTH = %s, ITEMX = %s, ITEMY = %s\n", width, itemx, itemy); #endif strcpy(tempTemplate, SYMM_HEMM_HELPER); if ((kflags & KEXTRA_TRANS_A) == 0) { if (kflags & KEXTRA_TRANS_B) { #ifdef DEBUG_GEMM_TAIL printf("GEMM_TAIL: Using GEMM_NT_KERNEL\n"); #endif strcat(tempTemplate, GEMM_NT_KERNEL); } else { #ifdef DEBUG_GEMM_TAIL printf("GEMM_TAIL: Using GEMM_NN_KERNEL\n"); #endif strcat(tempTemplate, GEMM_NN_KERNEL); } } else { // // NOTE: A^T * B Never leaves any tails. This should NEVER be called. 
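        // (In the A-transposed cases the template is left empty, so
        // kobj.spit() below presumably produces no kernel body and no tail
        // kernel is emitted.)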
// PENDING: A^T * B^T support is PENDING tempTemplate[0] = 0; } kobj.spit(buf, tempTemplate); //#ifdef DEBUG_GEMM_TAIL //printf("Kernel = \n%s\n", buf); //#endif size_t tail = strlen(buf) + 1; while(tail < 32*1024) { buf[tail++] = 0; } return 32*1024*sizeof(char); } static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; #ifdef DEBUG_GEMM_TAIL printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n", blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex), CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex)); printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n", blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex), CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex)); printf("TailStartM = %lu, TailStartN = %lu\n", blasArgs->tailStartM, blasArgs->tailStartN); #endif INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument INIT_KARG(&args[2], blasArgs->C); //y - scratch == _x_vector argument initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); initSizeKarg(&args[5], blasArgs->K); initSizeKarg(&args[6], blasArgs->lda.matrix); initSizeKarg(&args[7], blasArgs->ldb.matrix); initSizeKarg(&args[8], blasArgs->ldc.matrix); initSizeKarg(&args[9], blasArgs->offA); initSizeKarg(&args[10], blasArgs->offBX); initSizeKarg(&args[11], blasArgs->offCY); assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[13], &(blasArgs->beta), blasArgs->dtype); initSizeKarg(&args[14], blasArgs->tailStartM); initSizeKarg(&args[15], blasArgs->tailStartN); return; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } extern "C" void initGemmV2TailCachedPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based gemm tail"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &gemmSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } clblas-2.10/src/library/blas/gens/gemv.c000066400000000000000000000441431264277366700201400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * gemv generator */ #include #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include "xxmv_common.h" typedef struct { size_t staggered; } MAY_ALIAS extraData_t; static const char *gemvDecl = "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" "void __kernel\n" "%cgemv(\n" " uint %c,\n" " uint %c,\n" " const %s alpha,\n" " const __global %s *restrict A,\n" " const __global %s *restrict X,\n" "%s" " __global %s *Y,\n" " uint lda" "%s" // offset A, X and Y "%s" "%s)\n"; static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static SolverFlags solverFlags(void); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static int subgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs); static SolverOps gemvSops = { generator, assignKargs, isFitToLDS, NULL, NULL, calcNrThreads, NULL, solverFlags, fixupArgs, subgGetDefaultDecomp,//getDefaultDecomposition subgCheckCalcDecomp, //get Decomp. list NULL, NULL }; static void declareGemvKernel( struct KgenContext *ctx, DataType dtype, const PGranularity *pgran, KernelExtraFlags kflags) { char sizeNames[2] = {'M', 'N'}; bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0); bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0); bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0); const char *incxDecl = incxOne ? "" : ",\n const int incx"; const char *incyDecl = incyOne ? 
"" : ",\n const int incy"; char offDecl[128]; char betaDecl[128]; char tmp[512]; char fpref; bool tra = ((kflags & KEXTRA_TRANS_A) != 0); const char *typeName; typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); offDecl[0] = '\0'; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { strcpy(offDecl, ",\n const uint offA"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { strcat(offDecl, ",\n const uint offX"); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { strcat(offDecl, ",\n const uint offY"); } if (beta0) { betaDecl[0] = '\0'; } else { sprintf(betaDecl, " const %s beta,\n", typeName); } sprintf(tmp, gemvDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, sizeNames[tra], sizeNames[1 - tra], typeName, typeName, typeName, betaDecl, typeName, offDecl, incxDecl, incyDecl); kgenDeclareFunction(ctx, tmp); } static void setFetchHandler( TileMulOpts *mulOpts, const BlasGenSettings *gset, int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv), TilePostFetchPrivate *priv) { int i, nrPrivs; const char *regName = NULL; nrPrivs = 1; for (i = 0; i < nrPrivs; i++) { priv[i].fetchNumA = 0; priv[i].wholeA = 1; priv[i].funcID = CLBLAS_GEMV; priv[i].gset = gset; priv[i].regName = regName; mulOpts->postFetch = handler; mulOpts->postFetchPriv = priv; } } // global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; //yes, KEXTRA_TAILS_K because it is set if N % bw != 0 bool tailN = ((kflags & KEXTRA_TAILS_K) != 0); bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); char tmp[4096]; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); BlasGenSettings gset; TileMulOpts mulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; TilePostFetchPrivate pfPriv; unsigned int vecLen = kextra->vecLen; const char *outTypeName; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; const char *typeName; size_t wgSize; //unsigned int nStep = 32; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; //8; unsigned int cLocal; bool isComplex = isComplexType(dtype); unsigned int nPlans; typeName = dtypeBuiltinType(dtype); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // now, generate the kernel declareGemvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "// M always denotes length of Y " "and N denotes length of X in the kernel\n"); /* 1D work space. 
     * Matrix is divided among work items; each calculates its own
     * part of vector y.
     */
    wgSize = (subdims[0].y / subdims[1].y) *
             (subdims[0].bwidth / subdims[1].bwidth);
    assert(pgran->wgSize[0] == wgSize);
    assert(subdims[0].x == 1);
    assert(subdims[1].x == 1);

    cLocal = wgSize / bStep;

    memcpy(gset.subdims, subdims, sizeof(gset.subdims));
    gset.subdims[0].itemX = gset.subdims[0].x = 1;
    gset.subdims[1].itemX = gset.subdims[1].x = 1;
    gset.subdims[0].bwidth = gset.subdims[1].bwidth;
    gset.pgran = pgran;
    gset.kextra = kextra;
    gset.flags = BGF_UPTRS;

    initDefaultTiles(&gset, CLBLAS_GEMV, 0, PRIV_STORAGE_VARIABLE_SET);
    if (isComplex) {
        gset.tileCY.vecLen = 1;
    }
    declareTileStorages(ctx, &gset);
    genZeroTile(ctx, &gset.tileCY);

    getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL);
    nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen;

    sprintf(tmp, "__local %s localRes[%u][%u];\n",
            outTypeName, pgran->wgSize[0], nPlans);
    kgenAddStmt(ctx, tmp);

    sprintf(tmp, "uint coordA = (%s * %u + %s %% %u) * %lu;\n",
            gid, bStep, lid, bStep, subdims[1].y);
    kgenAddStmt(ctx, tmp);

    sprintf(tmp, "uint k0 = (%s / %u) * %lu;\n",
            lid, bStep, subdims[1].bwidth);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);

    kgenBeginBranch(ctx, "if (coordA < M && k0 < N)");

    genIncPointers(ctx, kflags);
    sprintf(tmp,
            "const GPtr Ag = {(__global %s*)A};\n"
            "const GPtr Xg = {(__global %s*)X};\n",
            typeName, typeName);
    kgenAddStmt(ctx, tmp);
    kgenAddBlankLine(ctx);

    if (tailN) {
        sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth);
        kgenAddStmt(ctx, tmp);
        kgenAddStmt(ctx, "N -= Ntail;\n");
        kgenAddBlankLine(ctx);
    }

    mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC;
    if (tailM) {
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A;
    }

    vnames->A = "Ag";
    vnames->B = "Xg";
    vnames->coordA = "coordA";
    vnames->coordB = "";    // should not be used for a vector
    vnames->k = "k";
    vnames->lda = "lda";
    vnames->sizeK = "N";
    vnames->sizeM = "M";

    mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB |
                     TILEMUL_C_COLUMN_MAJOR | TILEMUL_NOT_INC_K;
    if ((kflags & KEXTRA_CONJUGATE_A) != 0) {
        mulOpts.flags |= TILEMUL_CONJA;
    }
    if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        mulOpts.flags |= TILEMUL_TRA;
    }
    if ((kflags & KEXTRA_ENABLE_MAD) != 0) {
        mulOpts.core = TILEMUL_MAD;
    }
    else {
        mulOpts.core = TILEMUL_MULADD;
    }
    mulOpts.memA = CLMEM_GLOBAL_MEMORY;
    mulOpts.memB = CLMEM_GLOBAL_MEMORY;

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = pgran->wgSize[0] * subdims[1].bwidth;
        mulOpts.flags |= TILEMUL_BW_STRIDE;
    }

    sprintf(tmp, "uint k = k0;\nfor (; k < N; k += %lu)",
            cLocal * subdims[1].bwidth);
    kgenBeginBranch(ctx, tmp);

    if (staggered) {
        vnames->k = "k1";
        sprintf(tmp, "const uint k1 = (k + get_group_id(0)*%lu)%%N;\n",
                staggered);
        kgenAddStmt(ctx, tmp);
    }

    genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
              mulOpts.flags, kflags);
    ret = tileMulGen(ctx, &gset, &mulOpts);
    if (ret != 0) {
        return ret;
    }
    vnames->k = "k";

    kgenEndBranch(ctx, NULL);   /* k loop */

    if (tailN) {
        /* Handle tail along vector X */
        kgenAddStmt(ctx, "N += Ntail;\n");
        kgenBeginBranch(ctx, "if (k < N)");

        mulOpts.flags |= TILEMUL_SKEW_B;
        genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames,
                  mulOpts.flags, kflags);
        mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K | TILEMUL_WRAP_AROUND_TAIL;
        setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv);
        ret = tileMulGen(ctx, &gset, &mulOpts);
        if (ret != 0) {
            return ret;
        }

        kgenEndBranch(ctx, NULL);
    }

    if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        gset.subdims[0].bwidth = subdims[1].bwidth;
        mulOpts.flags &= ~TILEMUL_BW_STRIDE;
    }
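    /*
     * The statements emitted below form the reduction and write-back phase.
     * As a rough, illustrative sketch only (names such as 'plane' and the
     * exact accumulation indexing are assumptions, not text produced by this
     * generator verbatim), the generated OpenCL is expected to look like:
     *
     *     localRes[lid][plane] = c[plane];          // each work item parks its partial sums
     *     barrier(CLK_LOCAL_MEM_FENCE);
     *     if (lid < bStep && coordA < M && k0 < N) {
     *         for (uint i = 1; i < cLocal; i++) {   // first bStep items gather the rest
     *             c[plane] += localRes[lid + i * bStep][plane];
     *         }
     *         // then y := alpha*A*x + beta*y for this item's rows of Y
     *     }
     */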
    kgenEndBranch(ctx, NULL);

    genStoreLocalResult(ctx, &gset.tileCY, lid);
    kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE);
    kgenAddBlankLine(ctx);

    sprintf(tmp, "if (%s < %u && coordA < M && k0 < N)", lid, bStep);
    kgenBeginBranch(ctx, tmp);

    genAddLocalResult(ctx, &gset.tileCY, lid, cLocal, bStep);

    /* write back the results */
    /* y := alpha*A*x + beta*y */
    setResultPos(ctx, kflags, vnames->coordA);
    updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY);

    kgenEndBranch(ctx, NULL);

    kgenEndFuncBody(ctx);
    ret = kgenAddBlankLine(ctx);

    if (!ret) {
        ret = (ssize_t)kgenSourceSize(ctx) + 1;
    }
    destroyKgenContext(ctx);

    return (ret < 0) ? -EOVERFLOW : ret;
}

static void
assignKargs(KernelArg *args, const void *params, const void *extra)
{
    const CLBlasKargs *blasArgs = (const CLBlasKargs*)params;
    KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags;
    cl_int inc;
    int i;

    initSizeKarg(&args[0], blasArgs->M);
    initSizeKarg(&args[1], blasArgs->N);
    assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype);
    INIT_KARG(&args[3], blasArgs->A);
    INIT_KARG(&args[4], blasArgs->B);
    i = 5;
    if (!(kflags & KEXTRA_BETA_ZERO)) {
        assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype);
    }
    INIT_KARG(&args[i], blasArgs->C);
    i++;
    initSizeKarg(&args[i++], blasArgs->lda.matrix);
    if (kflags & KEXTRA_A_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offA);
    }
    if (kflags & KEXTRA_BX_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offBX);
    }
    if (kflags & KEXTRA_CY_OFF_NOT_ZERO) {
        initSizeKarg(&args[i++], blasArgs->offCY);
    }
    if (!(kflags & KEXTRA_INCX_ONE)) {
        inc = blasArgs->ldb.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
    if (!(kflags & KEXTRA_INCY_ONE)) {
        inc = blasArgs->ldc.vector;
        INIT_KARG(&args[i], inc);
        i++;
    }
}

static void
fixupArgs(void *args, SubproblemDim *subdims, void *extra)
{
    CLBlasKargs *kargs = (CLBlasKargs*)args;
    KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags;
    const size_t nChans = 8;     // !!!DEVICE DEPENDENT!!!
    const size_t wideChans = 64; // !!!DEVICE DEPENDENT!!!
    const size_t sizeType[] = {1, 2, 2, 4};

    size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype];
    size_t off = kargs->K % sizeBlock;
    extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv;

    if (off == 0 && !isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) {
        /*
         * FIXME: staggered access is not enabled for now since, for some
         * reason, it leads to a slowdown at small sizes
         */
        extraData->staggered = 0; // wideChans / sizeType[kargs->dtype];
    }
    else {
        extraData->staggered = 0;
    }

    (void)subdims;

    off = (kargs->offsetM) ?
kargs->offsetM : kargs->offsetN; if (off) { if (isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { kargs->offA += off; } else { kargs->offA += off * kargs->lda.matrix; } if (kargs->ldc.vector < 0) { // K store the original height of the matrix A kargs->offCY += (kargs->K - off) * abs(kargs->ldc.vector); } else { kargs->offCY += off * kargs->ldc.vector; } } kargs->offsetM = kargs->offsetN = 0; } static int subgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs) { (void)subdimsNum; DUMMY_ARG_USAGE(pArgs); pgran->wgDim = 1; pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; subdims[1].bwidth = 4; subdims[1].itemX = subdims[1].x = 1; subdims[1].itemY = subdims[1].y = 4; subdims[0].bwidth = 8 * subdims[1].bwidth; subdims[0].itemX = subdims[0].x = 1; subdims[0].itemY = subdims[0].y = 8 * subdims[1].y; return 0; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { (void)kernelArgs; if (1) { cl_ulong size; /* * One needs y1 * wgSize size of local memory in elements, but * y1 is not calculated yet. The expression below produces * reliable a larger value. It is larger in dims[1].bwidth times. */ size = dim[0].y * dim[0].bwidth * dtypeSize(dtype); return (size <= ldsSize); } return true; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { size_t yLen; /* Length of "Y" vector */ const CLBlasKargs *kargs = args; unsigned int subgr = pgran->wgSize[0] / (subdims[0].bwidth / subdims[1].bwidth); (void)subdims; (void)extra; yLen = kargs->transA == clblasNoTrans ? kargs->M : kargs->N; if (yLen == 0) { yLen = 1; //launch one group to avoid CL_INVALID_WORK_GROUP_SIZE error } //each work item handles y1 lines threads[0] = divRoundUp(yLen, subdims[1].y) * subgr; threads[0] = roundUp(threads[0], pgran->wgSize[0]); threads[1] = 0; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { unsigned int divider1 = dtypeSize(dtype)/sizeof(cl_float); unsigned int divider0 = 2-!isComplexType(dtype); //EINVAL if( (subdimsNum<2)|| (NULL==pgran)|| (NULL==subdims) ){ return false; } if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } if( subdims[1].x != subdims[1].itemX || subdims[1].y != subdims[1].itemY ){ return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].x || subdims[0].y % subdims[1].y || subdims[0].bwidth % subdims[1].bwidth ){ return false; } //check fitting of bw to common vector sizes if( isComplexType(dtype) ){ if( 2*subdims[1].bwidth > 32 ){ return false; } } // check dimensions if( subdims[1].bwidth > 16 / divider1 || subdims[1].x > 1 || subdims[1].y > 16 / divider1 ){ return false; } if( subdims[0].bwidth > 256 / divider0 || subdims[0].x > 1 || subdims[0].y > 256 / divider0 ){ return false; } if (64 != (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth)) { return false; } // passed PGranularity should be checked if( PGRAN_CHECK == check ){ if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){ return false; } } // PGranularity should be calculated else{ pgran->wgDim = 1; pgran->wgSize[1] = 1; pgran->wgSize[0] = 64; //subdims[0].bwidth = (pgran->wgSize[0] * subdims[1].bwidth) / // (subdims[0].y / 
subdims[1].y); } /*Debug out for Tune*/ return true; } //----------------------------------------------------------------------------- void initGemvPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block gemv"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &gemvSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } clblas-2.10/src/library/blas/gens/gen_helper.c000066400000000000000000000353761264277366700213220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "gen_helper.h" #include "clblas_stddef.h" #define IDX_INVAL ((unsigned int)-1) typedef struct CopyPattern { SubproblemDim dim; const PGranularity *pgran; DataType dtype; DBlockCopyDirection dir; DBlockCopyFlags flags; bool generic; bool zeroing; } CopyPattern; static __inline void dimSwapXY(SubproblemDim *dim) { size_t tmp = dim->x; dim->x = dim->y; dim->y = tmp; } /* * Initialize a dimension structure with the * respective values if it's needed or mark them * as unused */ static void checkInitSubdim( SubproblemDim *dim, unsigned int flags, unsigned int checkedFlag, size_t x, size_t y) { if (flags & checkedFlag) { dim->x = x; dim->y = y; } else { dim->x = SUBDIM_UNUSED; dim->y = SUBDIM_UNUSED; } } /* * check if such dimension instance * does already exist in the array */ static int lookupDim( const SubproblemDim *dim, unsigned int idx) { unsigned int i; for (i = 0; i < idx; i++) { if (dim[i].x == dim[idx].x && dim[i].y == dim[idx].y) { break; } } return (i == idx) ? IDX_INVAL : i; } static int cpyGenCallback(struct KgenContext *ctx, const void *pattern) { const CopyPattern *pat = (CopyPattern*)pattern; const void *dim = (pat->generic) ? 
NULL : &pat->dim; return copyDataBlockGen(ctx, dim, pat->pgran, pat->dtype, pat->dir, pat->flags); } static void initCopyPattern( CopyPattern *pattern, const SubproblemDim *blasDim, KernelExtraFlags flags, MatrixRole mrole, BlasFunctionID funcID) { SubproblemDim *dim = &pattern->dim; unsigned int vecFlag = 0; pattern->flags = 0; if (blasDim == NULL) { pattern->generic = true; dim->x = 0; dim->y = 0; } else { pattern->generic = false; switch (mrole) { case MATRIX_A: dim->x = blasDim->bwidth; dim->y = blasDim->y; break; case MATRIX_B: dim->x = blasDim->bwidth; dim->y = blasDim->x; break; case MATRIX_C: dim->x = blasDim->x; dim->y = blasDim->y; break; default: break; } } switch (mrole) { case MATRIX_A: vecFlag = KEXTRA_NO_COPY_VEC_A; break; case MATRIX_B: vecFlag = KEXTRA_NO_COPY_VEC_B; break; case MATRIX_C: if ((funcID == CLBLAS_TRMM) || (funcID == CLBLAS_TRSM)) { vecFlag = KEXTRA_NO_COPY_VEC_B; } else { vecFlag = KEXTRA_NO_COPY_VEC_C; } break; default: break; } if (flags & vecFlag) { pattern->flags |= DBLOCK_COPY_NOT_VECTORIZE; } if (isMatrixAccessColMaj(funcID, flags, mrole)) { if ((pattern->dir == DBLOCK_GLOBAL_TO_LOCAL) && !pattern->generic) { dimSwapXY(dim); } pattern->flags |= DBLOCK_COPY_TRANSPOSE; } if (isMatrixConj(flags, mrole)) { pattern->flags |= DBLOCK_COPY_CONJUGATE; } } int generateBufCopyFuncs( CopyBufFuncs *funcNames, struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, BufCopyHelperFlags flags) { CopyPattern pattern; struct KgenGuard *guard; int ret = 0; MatrixRole mrole; bool needed[MATRIX_ROLES_NUMBER]; KernelExtraFlags kgenFlags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; const SubproblemDim *blasDim = gset->subdims; const PGranularity *pgran = gset->pgran; bool outputTails = (kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); guard = createKgenGuard(ctx, cpyGenCallback, sizeof(CopyPattern)); if (guard == NULL) { return -ENOMEM; } memset(&pattern, 0, sizeof(pattern)); pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; pattern.dtype = dtype; pattern.pgran = pgran; needed[MATRIX_A] = (flags & BCHF_MATRIX_A); needed[MATRIX_B] = (flags & BCHF_MATRIX_B); needed[MATRIX_C] = (flags & BCHF_READ_OUTPUT); for (mrole = MATRIX_A; mrole <= MATRIX_C; mrole++) { if (!needed[mrole]) { continue; } initCopyPattern(&pattern, blasDim, kgenFlags, mrole, funcID); findGenerateFunction(guard, &pattern, funcNames->read[mrole], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if (flags & BCHF_WRITE_OUTPUT) { if (flags & BCHF_IMAGE_WRITE) { pattern.dir = DBLOCK_LOCAL_TO_IMAGE; initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID); pattern.flags &= ~DBLOCK_COPY_TRANSPOSE; } else { pattern.dir = DBLOCK_LOCAL_TO_GLOBAL; initCopyPattern(&pattern, blasDim, kgenFlags, MATRIX_C, funcID); } ret = findGenerateFunction(guard, &pattern, funcNames->write, FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if (ret) { destroyKgenGuard(guard); return ret; } // reevaluate needed flags needed[MATRIX_A] = needed[MATRIX_A] && (kgenFlags & (KEXTRA_TAILS_M | KEXTRA_TAILS_K)); needed[MATRIX_B] = needed[MATRIX_B] && (kgenFlags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K)); needed[MATRIX_C] = needed[MATRIX_C] && outputTails; pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; for (mrole = MATRIX_A; mrole <= MATRIX_C; mrole++) { if (!needed[mrole]) { continue; } initCopyPattern(&pattern, NULL, kgenFlags, mrole, funcID); findGenerateFunction(guard, &pattern, funcNames->readGeneric[mrole], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if ((flags & BCHF_WRITE_OUTPUT) && outputTails) { if (flags & BCHF_IMAGE_WRITE) { pattern.dir = 
DBLOCK_LOCAL_TO_IMAGE; initCopyPattern(&pattern, NULL, kgenFlags, MATRIX_A, funcID); pattern.flags &= ~DBLOCK_COPY_TRANSPOSE; } else { pattern.dir = DBLOCK_LOCAL_TO_GLOBAL; initCopyPattern(&pattern,NULL, kgenFlags, MATRIX_C, funcID); } ret = findGenerateFunction(guard, &pattern, funcNames->writeGeneric, FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } destroyKgenGuard(guard); return ret; } int generateZeroingFuncs( ZeroFuncs *funcNames, struct KgenContext *ctx, const SubproblemDim *blasDim, const PGranularity *pgran, DataType dtype, ZeroGenHelperFlags flags) { int ret = 0; SubproblemDim dim[MATRIX_ROLES_NUMBER]; size_t tsize, nvecs; unsigned int i, j; tsize = dtypeSize(dtype); nvecs = fl4RowWidth(blasDim->bwidth, tsize); checkInitSubdim(&dim[MATRIX_A], flags, ZF_MATRIX_A, nvecs * blasDim->y, 1); checkInitSubdim(&dim[MATRIX_B], flags, ZF_MATRIX_B, nvecs * blasDim->x, 1); nvecs = fl4RowWidth(blasDim->x, tsize); checkInitSubdim(&dim[MATRIX_C], flags, ZF_MATRIX_C, nvecs * blasDim->y, 1); for (i = 0; (i < MATRIX_ROLES_NUMBER) && !ret; i++) { if (dim[i].x == SUBDIM_UNUSED) { continue; } // check whether the function is already generated j = lookupDim(dim, i); if (j != IDX_INVAL) { strcpy(funcNames->names[i], funcNames->names[j]); } else { ret = f4zeroBlockGen(ctx, &dim[i], pgran, "__local"); if (!ret) { kgenGetLastFuncName(funcNames->names[i], FUNC_NAME_MAXLEN, ctx); } kgenAddBlankLine(ctx); } } return ret; } UpdateResultFlags kextraToUpresFlags(BlasFunctionID funcID, KernelExtraFlags kflags) { UpdateResultFlags uf = 0; if (funcHasBeta(funcID) && !(kflags & KEXTRA_BETA_ZERO)) { uf |= UPRES_WITH_BETA; } if (isMatrixAccessColMaj(funcID, kflags, MATRIX_C)) { uf |= UPRES_COLUMN_MAJOR; } if (kflags & KEXTRA_NO_COPY_VEC_C) { uf |= UPRES_NO_VECTORIZATION; } return uf; } int generateResultUpdate( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName) { UpdateResultFlags flags; flags = kextraToUpresFlags(funcID, gset->kextra->flags); return genResultUpdateWithFlags(ctx, funcID, gset, flags, optFuncName, genericFuncName, NULL); } int genResultUpdateWithFlags( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, UpdateResultFlags flags, const char *optFuncName, const char *genericFuncName, const char *cachedName) { KernelExtraFlags kflags = gset->kextra->flags; UpdateResultOp op; char tmp[1024]; int ret = 0; const char *coordY, *coordX; UpresVarNames uvars; const KernelVarNames *kvarNames = &gset->varNames; const SubproblemDim *dim = &gset->subdims[1]; bool areTails, useCondition; memset(&uvars, 0, sizeof(uvars)); coordX = kvarNames->coordB; coordY = kvarNames->coordA; if (funcHasTriangMatrix(funcID)) { if (flags & UPRES_TRIANG_WRITE_C) { uvars.result = "C"; } else { uvars.result = "B"; } uvars.ld = "ldb"; } else { uvars.result = "C"; uvars.ld = "ldc"; } uvars.cachedName = cachedName; /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION * must return in case problem exceeds more precise lower level conditions * (KEXTRA_TAILS_M_LOWER, KEXTRA_TAILS_N_LOWER) before updating result */ areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0); if (useCondition) { bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailN = (kflags & KEXTRA_TAILS_N) != 0; if (tailM) { if (tailN) { sprintf(tmp, "if ((%s < %s) && (%s < %s))", coordY, kvarNames->sizeM, coordX, kvarNames->sizeN); } else { sprintf(tmp, "if (%s < %s)", coordY, 
kvarNames->sizeM); } } else { // here tailN is true sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN); } kgenBeginBranch(ctx, tmp); } else { kgenAddBlankLine(ctx); } if (optFuncName) { const char *betaStr; betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : ""; // update with functions invoking if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) { sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n", optFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr); } else { sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n" "if ((y == %lu) && (x == %lu)) {\n" " %s(%s, c, alpha, %s, %s, %s%s);\n" "}\n" "else {\n" " %s(%s, c, alpha, %s, %s, %s%s, y, x);\n" "}\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX, dim->y, dim->x, optFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr, genericFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr); } kgenAddStmt(ctx, tmp); } else { // inline result update flags |= UPRES_INLINE; op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET; uvars.startRow = coordY; uvars.startCol = coordX; uvars.nrRows = "y"; uvars.nrCols = "x"; if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) { ret = updateResultGen(ctx, gset, funcID, op, flags, &uvars); } else { sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX); kgenAddStmt(ctx, tmp); sprintf(tmp, "if ((y == %lu) && (x == %lu))", dim->y, dim->x); kgenBeginBranch(ctx, tmp); // optimized update updateResultGen(ctx, gset, funcID, op, flags, &uvars); kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else "); // not optimized update flags |= UPRES_GENERIC; updateResultGen(ctx, gset, funcID, op, flags, &uvars); ret = kgenEndBranch(ctx, NULL); } } if (useCondition) { ret = kgenEndBranch(ctx, NULL); } return (ret) ? -EOVERFLOW : 0; } //----------------------------------------------------------------------------- void checkGenBeginHitMatrixBlock( struct KgenContext *ctx, KernelExtraFlags kflags) { bool tailsM = (kflags & KEXTRA_TAILS_M) != 0; bool tailsN = (kflags & KEXTRA_TAILS_N) != 0; if (tailsM) { if (tailsN) { kgenBeginBranch(ctx, "if ((coord.x < N) && (coord.y < M))"); } else { kgenBeginBranch(ctx, "if (coord.y < M)"); } } else { if (tailsN) { kgenBeginBranch(ctx, "if (coord.x < N)"); } } } //----------------------------------------------------------------------------- void checkGenEndHitMatrixBlock( struct KgenContext *ctx, KernelExtraFlags kflags) { if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { kgenEndBranch(ctx, NULL); } }clblas-2.10/src/library/blas/gens/gen_helper.h000066400000000000000000000077341264277366700213240ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef GEN_HELPER_H_ #define GEN_HELPER_H_ #include #include #include #include "blas_kgen.h" typedef enum BufCopyHelperFlags { // buffer copy functions are needed for matrix A blocks BCHF_MATRIX_A = 0x01, // buffer copy functions are needed for matrix B blocks BCHF_MATRIX_B = 0x02, /* * read block of output matrix * (either B or C) */ BCHF_READ_OUTPUT = 0x04, // write block of output matrix BCHF_WRITE_OUTPUT = 0x08, // not unroll loops in transposing versions of customized generators BCHF_NOT_UNROLL_TRANSPOSE = 0x10, // output to image BCHF_IMAGE_WRITE = 0x20 } BufCopyHelperFlags; typedef enum ZeroGenHelperFlags { ZF_MATRIX_A = 0x01, ZF_MATRIX_B = 0x02, ZF_MATRIX_C = 0x04 } ZeroGenHelperFlags; /* * Name of functions copying matrix blocks between the global * and the local memory. Contains customized and generic transposing * or not transposing variants for reading and writing back depending on * generator flags, for all the matrices. * * A function name contained in a 'read*' field matches to a function * copying data from the global memory to the local, and this one * contained in a 'write*' field matches to a function copying in * inverse direction. */ typedef struct CopyBufFuncs { char read[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN]; char write[FUNC_NAME_MAXLEN]; char readGeneric[MATRIX_ROLES_NUMBER][FUNC_NAME_MAXLEN]; char writeGeneric[FUNC_NAME_MAXLEN]; } CopyBufFuncs; /* * Generate all needed functions copying matrix * blocks between the global and the local memory * * @funcs: function names structure * @ctx: generator context * @funcID: function ID * @gset: generator settings * @flags: helper flags * * The 'flags' field of the 'gset' structure must store flags from * the 'BufCopyHelperFlags' enumeration * * Name of functions dealing with blocks of the output matrix * are always stored to 'MATRIX_C' name fields. * * On success returns 0. If generation fails due * to buffer overflowing, returns -1. */ int generateBufCopyFuncs( CopyBufFuncs *funcNames, struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, BufCopyHelperFlags flags); /* * Have the same semantics as the previous helper, * but generate functions for zeroing local buffers. */ int generateZeroingFuncs( ZeroFuncs *funcNames, struct KgenContext *ctx, const SubproblemDim *blasDim, const PGranularity *pgran, DataType dtype, ZeroGenHelperFlags flags); UpdateResultFlags kextraToUpresFlags(BlasFunctionID, KernelExtraFlags kflags); int generateResultUpdate( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName); int genResultUpdateWithFlags( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, UpdateResultFlags flags, const char *optFuncName, const char *genericFuncName, const char *cachedName); void checkGenBeginHitMatrixBlock( struct KgenContext *ctx, KernelExtraFlags kflags); void checkGenEndHitMatrixBlock( struct KgenContext *ctx, KernelExtraFlags kflags); #endif /* GEN_HELPER_H_ */ clblas-2.10/src/library/blas/gens/gen_init.c000066400000000000000000000243651264277366700210020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Generators initialization */ #include #include "clblas-internal.h" #include "init.h" unsigned int initGemmMemPatterns(MemoryPattern *mempats) { initGemmLdsPattern(&mempats[0]); initGemmImgPattern(&mempats[1]); InitGEMMCachedBlockPattern(&mempats[2]); InitGEMMCachedSubgroupPattern(&mempats[3]); return 4; } int getGemmMemPatternIndex(clblasImplementation impl) { switch (impl) { case clblasLdsBlockGemm: return 0; case clblasImageBlockGemm: return 1; case clblasBlockGemmWithCaching: return 2; case clblasSubgroupGemmWithCaching: return 3; default: return -1; } } clblasImplementation getGemmPreferredPattern(void) { switch (clblasSolvers[CLBLAS_GEMM].defaultPattern) { case 0: return clblasLdsBlockGemm; case 1: return clblasImageBlockGemm; case 2: return clblasBlockGemmWithCaching; case 3: return clblasSubgroupGemmWithCaching; default: return clblasDefaultGemm; } } unsigned int initGemvMemPatterns(MemoryPattern *mempats) { initGemvPattern(mempats); return 1; } int getGemvMemPatternIndex(clblasImplementation impl) { switch (impl) { default: return -1; } } unsigned int initSymvMemPatterns(MemoryPattern *mempats) { initSymvPattern(mempats); return 1; } int getSymvMemPatternIndex(clblasImplementation impl) { switch (impl) { default: return -1; } } unsigned int initTrmmMemPatterns(MemoryPattern *mempats) { initTrmmLdsPattern(mempats); initTrmmImgPattern(&mempats[1]); initTrmmCachedBlockPattern(&mempats[2]); initTrmmCachedSubgroupPattern(&mempats[3]); return 4; } int getTrmmMemPatternIndex(clblasImplementation impl) { switch (impl) { case clblasLdsBlockTrmm: return 0; case clblasImageBlockTrmm: return 1; case clblasBlockTrmmWithCaching: return 2; case clblasSubgroupTrmmWithCaching: return 3; default: return -1; } } clblasImplementation getTrmmPreferredPattern(void) { switch (clblasSolvers[CLBLAS_TRMM].defaultPattern) { case 0: return clblasLdsBlockTrmm; case 1: return clblasImageBlockTrmm; case 2: return clblasBlockTrmmWithCaching; case 3: return clblasSubgroupTrmmWithCaching; default: return clblasDefaultTrmm; } } unsigned int initTrsmMemPatterns(MemoryPattern *mempats) { initTrsmLdsPattern(mempats); initTrsmImgPattern(&mempats[1]); initTrsmLdsLessCachedPattern(&mempats[2]); initTrsmCachedPattern(&mempats[3]); return 4; } int getTrsmMemPatternIndex(clblasImplementation impl) { switch (impl) { case clblasLdsBlockTrsm: return 0; case clblasImageBlockTrsm: return 1; case clblasBlockTrsmWithoutLds: return 2; case clblasBlockTrsmWithCaching: return 3; default: return -1; } } clblasImplementation getTrsmPreferredPattern(void) { switch (clblasSolvers[CLBLAS_TRSM].defaultPattern) { case 0: return clblasLdsBlockTrsm; case 1: return clblasImageBlockTrsm; case 2: return clblasBlockTrsmWithoutLds; case 3: return clblasBlockTrsmWithCaching; default: return clblasDefaultTrsm; } } unsigned int initSyrkMemPatterns(MemoryPattern *mempats) { initSyrkBlockPattern(&mempats[0]); initSyrkSubgPattern(&mempats[1]); return 2; } clblasImplementation getSyrkPreferredPattern(void) { switch (clblasSolvers[CLBLAS_SYRK].defaultPattern) { case 0: return 
clblasBlockSyrk; case 1: return clblasSubgSyrk; default: return clblasDefaultSyrk; } } int getSyrkMemPatternIndex(clblasImplementation impl) { switch (impl) { case clblasBlockSyrk: return 0; case clblasSubgSyrk: return 1; default: return -1; } } unsigned int initSyr2kMemPatterns(MemoryPattern *mempats) { initSyr2kBlockPattern(&mempats[0]); initSyr2kSubgPattern(&mempats[1]); return 2; } clblasImplementation getSyr2kPreferredPattern(void) { switch (clblasSolvers[CLBLAS_SYR2K].defaultPattern) { case 0: return clblasBlockSyr2k; case 1: return clblasSubgSyr2k; default: return clblasDefaultSyr2k; } } int getSyr2kMemPatternIndex(clblasImplementation impl) { switch (impl) { case clblasBlockSyr2k: return 0; case clblasSubgSyr2k: return 1; default: return -1; } } unsigned int initTrmvMemPatterns(MemoryPattern *mempats) { initTrmvRegisterPattern(&mempats[0]); return 1; } int getTrmvMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initTrsvMemPatterns(MemoryPattern *mempats) { initTrsvDefaultPattern(&mempats[0]); return 1; } int getTrsvMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initSyrMemPatterns(MemoryPattern *mempats) { initSyrDefaultPattern(&mempats[0]); return 1; } int getSyrMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initSyr2MemPatterns(MemoryPattern *mempats) { initSyr2DefaultPattern(&mempats[0]); return 1; } int getSyr2MemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initTrsvGemvMemPatterns(MemoryPattern *mempats) { initTrsvGemvDefaultPattern(&mempats[0]); return 1; } int getTrsvGemvMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initSymmMemPatterns(MemoryPattern *mempats) { initSymmDefaultPattern(&mempats[0]); return 1; } int getSymmMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initGemmV2MemPatterns(MemoryPattern *mempats) { initGemmV2CachedPattern(mempats); return 1; } int getGemmV2MemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initGemmV2TailMemPatterns(MemoryPattern *mempats) { initGemmV2TailCachedPattern(mempats); return 1; } int getGemmV2TailMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initGerMemPatterns(MemoryPattern *mempats) { initGerRegisterPattern(&mempats[0]); return 1; } int getGerMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initHerMemPatterns(MemoryPattern *mempats) { initHerDefaultPattern(&mempats[0]); return 1; } int getHerMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initHer2MemPatterns(MemoryPattern *mempats) { initHer2DefaultPattern(&mempats[0]); return 1; } int getHer2MemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initGbmvMemPatterns(MemoryPattern *mempats) { initGbmvRegisterPattern(&mempats[0]); return 1; } int getGbmvMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initSwapMemPatterns(MemoryPattern *mempats) { initSwapRegisterPattern(&mempats[0]); return 1; } int getSwapMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initScalMemPatterns(MemoryPattern *mempats) { initScalRegisterPattern(&mempats[0]); return 1; } int 
getScalMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initCopyMemPatterns(MemoryPattern *mempats) { initCopyRegisterPattern(&mempats[0]); return 1; } int getCopyMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initAxpyMemPatterns(MemoryPattern *mempats) { initAxpyRegisterPattern(&mempats[0]); return 1; } int getAxpyMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initDotMemPatterns(MemoryPattern *mempats) { initDotRegisterPattern(&mempats[0]); return 1; } int getDotMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initReductionMemPatterns(MemoryPattern *mempats) { initReductionRegisterPattern(&mempats[0]); return 1; } int getReductionMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initRotgMemPatterns(MemoryPattern *mempats) { initRotgRegisterPattern(&mempats[0]); return 1; } int getRotgMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initRotmgMemPatterns(MemoryPattern *mempats) { initRotmgRegisterPattern(&mempats[0]); return 1; } int getRotmgMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initRotmMemPatterns(MemoryPattern *mempats) { initRotmRegisterPattern(&mempats[0]); return 1; } int getRotmMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initiAmaxMemPatterns(MemoryPattern *mempats) { initiAmaxRegisterPattern(&mempats[0]); return 1; } int getiAmaxMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initNrm2MemPatterns(MemoryPattern *mempats) { initNrm2RegisterPattern(&mempats[0]); return 1; } int getNrm2MemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } unsigned int initAsumMemPatterns(MemoryPattern *mempats) { initAsumRegisterPattern(&mempats[0]); return 1; } int getAsumMemPatternIndex(clblasImplementation impl) { switch(impl) { default: return -1; } } clblas-2.10/src/library/blas/gens/ger_lds.cpp000066400000000000000000000244471264277366700211660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * ger generator */ //#define DEBUG_GER #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs); static SolverFlags solverFlags(void) { #ifdef DEBUG_GER printf("solverFlags callen......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* ); extern "C" void initGerRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps gerOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, // Related to images solverFlags, NULL, getDefaultDecomposition, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(((kargs->lda.matrix) % vlen) != 0) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_GER printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initGerRegisterPattern(MemoryPattern *mempat) { mempat->name = "Register accumulation based ger"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &gerOps; //CHECK THIS mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; #ifdef DEBUG_GER printf("initGerRegPattern called with mempat = 0x%p\n", mempat); fflush(stdout); #endif } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block size_t BH, BW; unsigned int VEC_LEN = extra->vecLenA; clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? 
clblasColumnMajor: clblasRowMajor; size_t nBlocksY; //number of blocks in Y dir ( Although we say 1D block to opencl ) size_t nBlocksX; //number of blocks in X dir BH = subdims->y; BW = subdims->x; if ( order == clblasColumnMajor ) { nBlocksY = ( kargs->M + BH*VEC_LEN - 1 ) / (BH*VEC_LEN); nBlocksX = ( kargs->N + BW - 1) / BW; } else { nBlocksY = ( kargs->M + BH - 1) / BH; nBlocksX = ( kargs->N + BW*VEC_LEN - 1) / (BW*VEC_LEN); } size_t blocks = nBlocksX * nBlocksY; threads[0] = blocks * BLOCKSIZE; threads[1] = 1; #ifdef DEBUG_GER printf("calcNrThreads called from GER_Reg.cpp.. wgSize[0]: %u\twgSize[1]: %u\n", pgran->wgSize[0], pgran->wgSize[1]); printf("subdim->y :%u\t subdim->x : %u\n", subdims->y, subdims->x); printf("kargs-> M : %d, kargs-> N: %d, BH: %d, BW: %d\n", kargs->M, kargs->N, BH, BW); printf("blocks : %d\tglobalthreads[0] : %u\t VecLen :%d\n", blocks, threads[0], VEC_LEN); #endif } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { size_t BH, BW;//BLOCKSIZE = pgran->wgSize[0]; // Because we are using 1D block unsigned int VEC_LEN; char tempTemplate[32*1024]; char bhStr[10], bwStr[10]; pgran = pgran; // To remove warnings if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; VEC_LEN = extraFlags->vecLenA; #ifdef DEBUG_GER printf("GER GENERATOR called.... with %s order, DataType %c & Vector-Length: %d\n", ((order == clblasColumnMajor)? "ColumnMajor": "RowMajor"), Prefix[extraFlags->dtype], VEC_LEN ); #endif if( order == clblasColumnMajor ) { strcpy( tempTemplate, (char*)ger_C_kernel ); } else { strcpy( tempTemplate, (char*)ger_R_kernel ); } // FIXME: VECTORSIZE HARD CODED // FIXME: SetKernelArgs.. sends offa, offx, and lda should be received as uint bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_GER printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_GER printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], VEC_LEN, doVLOAD, doVLOAD); BH = subdims->y; BW = subdims->x; sprintf( bhStr, "%" SPREFIX "u", BH ); sprintf( bwStr, "%" SPREFIX "u", BW ); #ifdef DEBUG_GER printf("BH = %s\n", bhStr); printf("BW = %s\n", bwStr); #endif kobj.put("%BH_DEF", (const char *)bhStr); kobj.put("%BW_DEF", (const char *)bwStr); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? 
-EOVERFLOW : ret; } /* ( __global const %TYPE* X, __global const %TYPE* Y, __global %TYPE* A, uint M, uint N, uint offx, int incx, uint offy, int incy, uint offa, uint lda, %TYPE alpha, int doConj ) */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy, doConj; INIT_KARG(&args[0], blasArgs->B); // B - our X vector INIT_KARG(&args[1], blasArgs->C); // C - our Y vector INIT_KARG(&args[2], blasArgs->A); // A - matrix A initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); incx = blasArgs->ldb.vector; incy = blasArgs->ldc.vector; initSizeKarg(&args[5], blasArgs->offBX); INIT_KARG(&args[6], incx); initSizeKarg(&args[7], blasArgs->offCY); INIT_KARG(&args[8], incy); initSizeKarg(&args[9], blasArgs->offa); initSizeKarg(&args[10], blasArgs->lda.matrix); assignScalarKarg(&args[11], &(blasArgs->alpha), blasArgs->dtype); doConj = (cl_int)(blasArgs->K); INIT_KARG(&args[12], doConj); // K was used as doConj #ifdef DEBUG_GER printf("doConj = %d\n", doConj ); #endif return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; SolutionStep *step = container_of(kargs, args, SolutionStep); unsigned int vecLen; vecLen = ((CLBLASKernExtra*)(step->kernels[CLBLAS_COMPUTING_KERNEL]->extra))->vecLenA; cl_ulong maxSize; if( kargs->order == clblasColumnMajor ) { maxSize = ( dim[0].x + (dim[0].y * vecLen) ) * sizeof(dtype); } else { maxSize = ( (dim[0].x * vecLen) + dim[0].y ) * sizeof(dtype); } return ( maxSize <= ldsSize ); } static int getDefaultDecomposition( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs) { SolutionStep *step = container_of( pgran , pgran, SolutionStep); size_t maxWorkGroupSize; cl_device_id devID = step->device.id; size_t wgX, wgY; pArgs = pArgs; clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); if( step->args.order == clblasColumnMajor ) { wgY = 16; // BH preferably 16(quarter wave-front) subdims[0].y = wgY; wgX = maxWorkGroupSize / wgY; // BW is left upto maxWorkGroupSize of the device wgX = szmin( wgX, 16 ); subdims[0].x = wgX; } else { wgX = 16; subdims[0].x = wgX; wgY = maxWorkGroupSize / wgX; wgY = szmin( wgY, 16 ); subdims[0].y = wgY; } pgran->wgDim = 1; //1D blocking pgran->wgSize[0] = (unsigned int)(wgX * wgY); pgran->wgSize[1] = 1; if(subdimsNum > 0) { subdims[0].itemX = subdims[0].x; subdims[0].itemY = subdims[0].y; subdims[0].bwidth = 1; } if(subdimsNum > 1) { subdims[1].itemY = 1; subdims[1].itemX = 1; subdims[1].y = subdims[1].itemY; subdims[1].x = subdims[1].itemX; subdims[1].bwidth = 1; } return 0; } clblas-2.10/src/library/blas/gens/her2_lds.cpp000066400000000000000000000214671264277366700212500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * HER2 Generator */ //#define DEBUG_HER2 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_HER2 printf("solverFlags called......\n"); #endif return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initHer2DefaultPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps her2Ops = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(kargs->uplo == clblasUpper) { if( (kargs->N) % vlen) { kflags = KEXTRA_NO_COPY_VEC_A; } } if( kargs->pigFuncID == CLBLAS_HPR2 ) { kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_COMPLEX_DOUBLE ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_HER2 printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->order == clblasRowMajor ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ROWMAJOR"); #ifdef DEBUG_HER2 printf("Setting build options ... HERMITIAN2_ROWMAJOR... for row-major support\n"); #endif } if( kargs->pigFuncID == CLBLAS_HPR2 ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); } //Build options for syr2_her2.clT to generate HER2 related code. 
addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ONLY"); return; } static CLBLASMpatExtra mpatExtra; extern "C" void initHer2DefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_HER2 printf("initHerDefaultPattern called with mempat = 0x%p\n", (void *)mempat); fflush(stdout); #endif mempat->name = "LDS based her2"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &her2Ops; mpatExtra.aMset = 0; mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector //mpatExtra.cMset = CLMEM_LEVEL_LDS; // For "y" vector mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; //mpatExtra.mobjC = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0]; // 1D Block #ifdef DEBUG_HER2 printf("calcNrThreads called from her2_lds.cpp\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra; extra = ( CLBLASKernExtra *)_extra; #ifdef DEBUG_HER2 printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x); #endif size_t TARGETROWS = subdims->y ; #ifdef DEBUG_HER2 printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS); #endif size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1; #ifdef DEBUG_HER2 printf("blocks : %d\n", blocks); #endif threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE; #ifdef DEBUG_HER2 printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { int BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[64*1024]; char targetRows[10], blockSize[10]; if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_HER2 printf("HER2 GENERATOR called....\n"); #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; if ((subdims->y % extraFlags->vecLenA) != 0) { printf("WARNING: HER2: generator: TARGETROWS must be divisible by Vector Length\n"); return 0; } size_t TARGETROWS = 0; ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)syr2_her2_CL_kernel)) : (strcpy(tempTemplate, (char*)syr2_her2_CU_kernel)); TARGETROWS = subdims->y; if ((BLOCKSIZE % TARGETROWS) != 0) { printf("WARNING: HER2: generator: Invalid Block Size\n"); return 0; } #ifdef DEBUG_HER2 printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. 
sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_HER2 printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_HER2 printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_HER2 printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); sprintf( targetRows, "%" SPREFIX "u", TARGETROWS ); sprintf( blockSize, "%d", BLOCKSIZE ); #ifdef DEBUG_HER2 printf("TARGET ROWS = %s\n", targetRows); printf("BLOCK SIZE = %s\n", blockSize); #endif kobj.put("%TARGET_ROWS", (const char *)targetRows); kobj.put("%BLOCKSIZE", (const char *) blockSize); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? -EOVERFLOW : ret; } /* ( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int inc; INIT_KARG(&args[0], blasArgs->A); //A - input/output matrix - argument INIT_KARG(&args[1], blasArgs->B); //X - x vector INIT_KARG(&args[2], blasArgs->C); //Y - y vector initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); inc = blasArgs->ldb.vector; INIT_KARG(&args[5], inc); initSizeKarg(&args[6], blasArgs->offCY); inc = blasArgs->ldc.vector; INIT_KARG(&args[7], inc); initSizeKarg(&args[8], blasArgs->offa); initSizeKarg(&args[9], blasArgs->lda.matrix); assignScalarKarg(&args[10], &(blasArgs->alpha), blasArgs->dtype); return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong maxSize; CLBlasKargs *blasArgs; blasArgs = (CLBlasKargs *)kernelArgs; // 4 buffers for xShared, yShared, xSharedTrans and ySharedTrans and 2 integers for the values of iShared and jShared. maxSize = (dim->y * 4 * sizeof(dtype)) + (2 * sizeof(int)); return ((maxSize) <= ldsSize); } //#undef DEBUG_HER2 clblas-2.10/src/library/blas/gens/her_lds.cpp000066400000000000000000000210641264277366700211570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * HER Generator */ #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include //#define DEBUG_HER extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_HER printf("solverFlags called......\n"); #endif return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initHerDefaultPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps herOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(kargs->uplo == clblasUpper) { if( (kargs->N) % vlen) { kflags = KEXTRA_NO_COPY_VEC_A; } } if( kargs->pigFuncID == CLBLAS_HPR ) { kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_COMPLEX_DOUBLE ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_HER printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->order == clblasRowMajor ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERMITIAN_ROWMAJOR"); #ifdef DEBUG_HER printf("Setting build options ... HERMITIAN_ROWMAJOR... for row-major support\n"); #endif } if( kargs->pigFuncID == CLBLAS_HPR ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); } //Build options for syr_her.clT to generate HER related code. 
addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER_ONLY"); return; } static CLBLASMpatExtra mpatExtra; extern "C" void initHerDefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_HER printf("initHerDefaultPattern called with mempat = 0x%p\n", (void *)mempat); fflush(stdout); #endif mempat->name = "LDS based HER"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &herOps; mpatExtra.aMset = 0; mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0]; // 1D Block #ifdef DEBUG_HER printf("calcNrThreads called from her_lds.cpp\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra; extra = ( CLBLASKernExtra *)_extra; #ifdef DEBUG_HER printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x); #endif size_t TARGETROWS = subdims->y ; #ifdef DEBUG_HER printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS); #endif size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1; #ifdef DEBUG_HER printf("blocks : %d\n", blocks); #endif threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE; #ifdef DEBUG_HER printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { int BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; char targetRows[10], blockSize[10]; if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_HER printf("HER GENERATOR called....\n"); #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; if ((subdims->y % extraFlags->vecLenA) != 0) { printf("WARNING: HER: generator: TARGETROWS must be divisible by Vector Length\n"); return 0; } size_t TARGETROWS = 0; ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)syr_her_CL_kernel)) : (strcpy(tempTemplate, (char*)syr_her_CU_kernel)); TARGETROWS = subdims->y; if ((BLOCKSIZE % TARGETROWS) != 0) { printf("WARNING: HER: generator: Invalid Block Size\n"); return 0; } #ifdef DEBUG_HER printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. 
sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_HER printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_HER printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_HER printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); sprintf( targetRows, "%" SPREFIX "u", TARGETROWS ); sprintf( blockSize, "%d", BLOCKSIZE ); #ifdef DEBUG_HER printf("TARGET ROWS = %s\n", targetRows); printf("BLOCK SIZE = %s\n", blockSize); #endif kobj.put("%TARGET_ROWS", (const char *)targetRows); kobj.put("%BLOCKSIZE", (const char *) blockSize); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? -EOVERFLOW : ret; } /* ( __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha ) */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx; INIT_KARG(&args[0], blasArgs->A); //A - input/output matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - x vector initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offa); initSizeKarg(&args[6], blasArgs->lda.matrix); DataType alphaType = (blasArgs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : TYPE_DOUBLE; assignScalarKarg(&args[7], &(blasArgs->alpha), alphaType); return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong maxSize; CLBlasKargs *blasArgs; blasArgs = (CLBlasKargs *)kernelArgs; // 2 buffers for xShared and yShared and 2 integers for the values of iShared and jShared. maxSize = (dim->y * 2 * sizeof(dtype)) + (2 * sizeof(int)); return ((maxSize) <= ldsSize); } //#undef DEBUG_HER clblas-2.10/src/library/blas/gens/iamax.cpp000066400000000000000000000164371264277366700206460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * amax generator */ //#define DEBUG_AMAX #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_AMAX printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initAmaxRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps amaxOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, fixupArgs, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offa) % vlen) != 0)) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_AMAX printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldb.vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID"); } if( (kargs->redctnType == REDUCE_MAX_WITH_INDEX_ATOMICS)) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initiAmaxRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_AMAX printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register AMAX"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &amaxOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; unsigned int VEC_LEN = extra->vecLenA; #ifdef DEBUG_AMAX printf("calcNrThreads called from amax.cpp\n"); #endif const CLBlasKargs *kargs = (CLBlasKargs *)args; size_t blocks = ((kargs->N - 1)/ (BLOCKSIZE*VEC_LEN)) + 1; #ifdef DEBUG_AMAX printf("blocks : %d\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; #ifdef DEBUG_AMAX printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARG_USAGE(subdims); size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_AMAX printf("AMAX GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_AMAX printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_AMAX printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_AMAX printf("Using Aligned Data Pointer .........................\n"); #endif } strcpy( tempTemplate, (char*)iamax_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXiamax_kernel( __global %TYPE *_X, __global %TYPE _scratchBuf, __global %TYPE *_iMax, uint N, uint offx, int incx, uint offiMax ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx; INIT_KARG(&args[0], blasArgs->B); INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offb); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); return; } /** The purpose of this function is to add an work-group size indicator in kernelKey, so that a different kernel is generated 
when work-group size is changed. Reduction loop is unrolled in kprintf based on work-group size. Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel this will become a kernelKey, and kernel cache will be accordingly managed. Note -- SubproblemDim is a member of kernelKey **/ static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { DUMMY_ARG_USAGE(extra); CLBlasKargs *kargs = (CLBlasKargs*)args; SolutionStep *step = container_of(kargs, args, SolutionStep); subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]); } clblas-2.10/src/library/blas/gens/init.h000066400000000000000000000061341264277366700201500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Declarations generators initialization */ #ifndef INIT_H_ #define INIT_H_ #ifdef __cplusplus extern "C" { #endif void initGemvPattern(MemoryPattern *mempat); void InitGEMMCachedBlockPattern(MemoryPattern *mempat); void InitGEMMCachedSubgroupPattern(MemoryPattern *mempat); void initGemmLdsPattern(MemoryPattern *mempat); void initGemmImgPattern(MemoryPattern *mempat); void initTrmmCachedBlockPattern(MemoryPattern *mempat); void initTrmmCachedSubgroupPattern(MemoryPattern *mempat); void initTrmmLdsPattern(MemoryPattern *mempat); void initTrmmImgPattern(MemoryPattern *mempat); void initTrsmLdsPattern(MemoryPattern *mempat); void initTrsmImgPattern(MemoryPattern *mempat); void initTrsmCachedPattern(MemoryPattern *mempat); void initTrsmLdsLessCachedPattern(MemoryPattern *mempat); void initSyr2kBlockPattern(MemoryPattern *mempat); void initSyr2kSubgPattern(MemoryPattern *mempat); void initSyrkBlockPattern(MemoryPattern *mempat); void initSyrkSubgPattern(MemoryPattern *mempat); void initSymvPattern(MemoryPattern *mempat); void initTrmvRegisterPattern(MemoryPattern *mempat); void initTrsvDefaultPattern(MemoryPattern *mempat); void initTrsvGemvDefaultPattern(MemoryPattern *mempat); void initSymmDefaultPattern(MemoryPattern *mempat); void initGerRegisterPattern(MemoryPattern *mempat); void initSyrDefaultPattern(MemoryPattern *mempat); void initSyr2DefaultPattern(MemoryPattern *mempat); void initHerDefaultPattern(MemoryPattern *mempat); void initHer2DefaultPattern(MemoryPattern *mempat); void initGemmV2CachedPattern(MemoryPattern *mempat); void initGemmV2TailCachedPattern(MemoryPattern *mempat); void initGbmvRegisterPattern(MemoryPattern *mempat); void initSwapRegisterPattern(MemoryPattern *mempat); void initScalRegisterPattern(MemoryPattern *mempat); void initCopyRegisterPattern(MemoryPattern *mempat); void initAxpyRegisterPattern(MemoryPattern *mempat); void initDotRegisterPattern(MemoryPattern *mempat); void initReductionRegisterPattern(MemoryPattern *mempat); void initRotgRegisterPattern(MemoryPattern *mempat); void initRotmgRegisterPattern(MemoryPattern *mempat); void 
initRotmRegisterPattern(MemoryPattern *mempat); void initiAmaxRegisterPattern(MemoryPattern *mempat); void initNrm2RegisterPattern(MemoryPattern *mempat); void initAsumRegisterPattern(MemoryPattern *mempat); #ifdef __cplusplus } #endif #endif /* INIT_H_ */ clblas-2.10/src/library/blas/gens/kprintf.cpp000066400000000000000000002424261264277366700212230ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include static const char *types[] = { "float", "float2", "float3", "float4", "float8", "float16", "double", "double2", "double3", "double4", "double8", "double16" }; static const char*vloadTypes[] = { "vload", "vload2", "vload3", "vload4", "vload8", "vload16" }; static const char*vstoreTypes[] = { "vstore", "vstore2", "vstore3", "vstore4", "vstore8", "vstore16" }; static const char *vecIndices[] = { "S0", "S1", "S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "SA", "SB", "SC", "SD", "SE", "SF" }; static const char *vecIndicesWithDot[] = { ".S0", ".S1", ".S2", ".S3", ".S4", ".S5", ".S6", ".S7", ".S8", ".S9", ".SA", ".SB", ".SC", ".SD", ".SE", ".SF" }; static const char *vecComplexIndicesWithDot[] = { ".s01", ".s23", ".s45", ".s67", ".s89", ".sAB", ".sCD", ".sEF" }; static const char *vectorWidthTypes[] = { "1", "2", "3", "4", "6", "8", "16" }; static const char *numbers[] = { "0", "1", "2", "3", "4" , "5", "6" ,"7", "8", "9", "10", "11", "12", "13", "14", "15", "16" }; //#define MUL_SCALAR_UNROLL //#define DIV_SCALAR_UNROLL kprintf::fmt_t kprintf::get(const char *key) { std::vector::iterator t; int l, knownLength, lengthKeyMax = -1; struct fmt retval; retval.key=NULL; retval.value=NULL; knownLength = (int)strlen(key); for(t = v.begin(); t != v.end(); t++) { l = (int)strlen((*t).key); if (l > knownLength) { continue; } if (strncmp(key, (*t).key, l) == 0) { if (l > lengthKeyMax) { retval = (*t); lengthKeyMax = l; } } } return retval; } const char * kprintf::findType(char *type) { size_t i; for(i=0; i 1) { generateVecSuffix(vecSuffix, effectiveVectorWidthOnBaseType); generateVecSuffix(vecSuffixPtype, vecWidth); strcat(derivedType, vecSuffix); strcat(derivedTypePtype, vecSuffixPtype); string = findType(derivedType); if (string != NULL) { put("%TYPE%V", string ); DERIVED = string; } else { std::cout << "kprint() constructor: Invalid vector width specified" << std::endl; throw -1; } string = findType(derivedTypePtype); if (string != NULL) { put("%PTYPE%V", string ); } else { std::cout << "kprint() constructor: Invalid vector width specified" << std::endl; throw -1; } } else { put("%TYPE%V", baseType); string = findType(derivedTypePtype); put("%PTYPE%V", string); // FIXED DERIVED = baseType; } // // Register HALF (%HV), QUARTER(%QV), HALF_QUARTER(%OV) types // struct fmt f; f = get("%TYPE%V"); registerReducedTypes(f.value, 2); registerReducedTypes(f.value, 4); registerReducedTypes(f.value, 8); 
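// Illustrative sketch (assumed instantiation, not a real call site): for a
// 'single' kprintf built with vector width 4, the registrations up to this
// point resolve roughly to
//     %TYPE%V  -> float4
//     %TYPE%HV -> float2   (%HV -> 2)
//     %TYPE%QV -> float    (%QV -> 1)
//     %TYPE%OV -> "NULL"   (4/8 truncates to 0, so no half-quarter type exists)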
registerSuperTypes(f.value, 2); registerSuperTypes(f.value, 4); registerSuperTypes(f.value, 8); HALFWORD = get("%TYPE%HV").value; QUARTERWORD = get("%TYPE%QV").value; HALFQUARTERWORD = get("%TYPE%OV").value; registerVectorWidth(); // Register MakeVector : V, HV, QV, OV put("%MAKEV", NULL); put("%MAKHV", NULL); put("%MAKQV", NULL); put("%MAKOV", NULL); } void kprintf::registerReducedTypes( const char* in, int div) { char vecSuffix[3] = {0}; char tempStr[9] = {0}; const char* reducedCase = (div == 2) ? "%TYPE%HV" : ( div == 4) ? "%TYPE%QV" : "%TYPE%OV"; const char* reducedVectorLength = (div == 2) ? "%HV" : ( div == 4) ? "%QV" : "%OV"; bool vecSuffixEmpty = false; if ( !( effectiveVectorWidthOnBaseType / div)) { //std::cout << "Warning : Vector reduces to zero - registering " << reducedCase << " as NULL" << std::endl; put(reducedCase, "NULL"); return; } if ((effectiveVectorWidthOnBaseType / div) > 1) { generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType / div); } else { vecSuffix[0] = '\0'; vecSuffixEmpty = true; } if( in[4] == 't') // float { strcpy( tempStr, "float"); } else { strcpy( tempStr, "double"); } strcat( tempStr, vecSuffix); put( reducedCase, findType(tempStr)); if (vecSuffixEmpty == false) put( reducedVectorLength, findVectorWidthType(vecSuffix)); else put( reducedVectorLength, "1"); } void kprintf::registerSuperTypes( const char* in, int mul) { char vecSuffix[3] = {0}; char tempStr[9] = {0}; const char* superCase = ((mul == 2) ? "%TYPE%DV" : ( mul == 4) ? "%TYPE%QUADV" : "%TYPE%OCTAV"); const char* superVectorLength = ((mul == 2) ? "%DV" : ( mul == 4) ? "%QUADV" : "%OCTAV"); if ( ( effectiveVectorWidthOnBaseType * mul) > 16) { //std::cout << "Warning : Super Vector is not a OCL type- registering " << superCase << " as NULL" << std::endl; put(superCase, "NULL"); return; } if ((effectiveVectorWidthOnBaseType * mul) > 1) { generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType * mul); } else { vecSuffix[0] = '\0'; } if( in[4] == 't') // float { strcpy( tempStr, "float"); } else { strcpy( tempStr, "double"); } strcat( tempStr, vecSuffix); put( superCase, findType(tempStr)); put( superVectorLength, findVectorWidthType(vecSuffix)); } char* kprintf::mystrtok( char* in, const char* tok) { char* last; if ( in ) // in is not NULL { last = in; // Initialize strtokPtr strtokPtr = in; // look for '(' while( *strtokPtr != '(') { strtokPtr++; } *strtokPtr = '\0'; strtokPtr++; strtokCount = 1; } else { last = strtokPtr; // Look for tokens other than '(' while(strtokPtr[0]) { bool tokenFound = false; for( size_t i=0 ; i <= (strlen(tok) - 1); i++) { if (*strtokPtr == tok[i]) { if ( tok[i] == '(') { strtokCount++; continue; } else if ( tok[i] == ')') { strtokCount--; if ( strtokCount != 0) { continue; } } // Token matched *strtokPtr = '\0'; tokenFound = true; break; } } if ( tokenFound) { strtokPtr++; break; } strtokPtr++; } } return last; } // // VLOAD %TYPE%V from (%PTYPE*) kind of memory locations // The Kernel writers should use "%TYPE" and "%TYPE%V" for kernel aguments, local variables etc.. // However, while loading using %VLOAD, they should cast the pointers as "%PTYPE *" because // VLOADn imposes certain restrictions. // Having the pointers as %TYPE and %TYPE%V relieves us from address calculations for primitives // which are vectors (like float2, double2 etc..) 
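// For instance (hypothetical template fragment, not quoted from any shipped
// .clT file), an argument written as "__global %TYPE *X" expands for the
// 'complex' case to "__global float2 *X", while a cast used around loads and
// stores, written as "(__global %PTYPE *)", expands to "(__global float *)";
// %TYPE carries the element type, %PTYPE the primitive scalar type.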
// void kprintf::registerVLOAD() { const char *string; char vecSuffix[3] = {0}; char tempStr[9] = {0}; generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType); // VLOAD %TYPE%V from %PTYPE kind of memory locations strcpy( tempStr, "vload"); strcat( tempStr, vecSuffix); string = findTypeVLOAD(tempStr); if (string != NULL) { put( "%VLOAD", string); } else { std::cerr << "registerVLOAD: " << tempStr << " not a valid VLOAD type" << std::endl; } } void kprintf::registerVSTORE(void) { const char *string; char vecSuffix[3] = {0}; char tempStr[9] = {0}; generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType); // VSTORE %TYPE%V from %PTYPE kind of memory locations strcpy( tempStr, "vstore"); if (effectiveVectorWidthOnBaseType > 1) { strcat( tempStr, vecSuffix); } string = findTypeVSTORE(tempStr); if (string != NULL) { put( "%VSTORE_VALUE", string); } else { std::cerr << "registerVSTORE: " << tempStr << " not a valid VSTORE type" << std::endl; } } void kprintf::registerVectorWidth() { const char *string; char vecSuffix[3] = {0}; generateVecSuffix( vecSuffix, vectorWidth); // VLOAD %TYPE%V from %PTYPE kind of memory locations string = findVectorWidthType(vecSuffix); if (string != NULL) { put( "%V", string); } else { std::cerr << "registerVectorWidth: " << string << " not a valid Vector Width size" << std::endl; } } void kprintf::handleMakeVector(char **_src, char **_dst, int div) { int numCharsWritten = 0; char id[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "()"); ptr = mystrtok( NULL, "()"); // Get ID strcpy( id, ptr); *_src = ptr + strlen(ptr) + 1; if ( div == 0 ) // Scalar Case { numCharsWritten = sprintf(dst,"(%s)(", BASE); dst += numCharsWritten; if ( s_or_v == VECTOR) { if ( strcmp( BASE,"float") == 0 || strcmp( BASE,"float2") == 0) { numCharsWritten = sprintf(dst," %s%c,", id, 'f'); } else { numCharsWritten = sprintf(dst," %s,", id); } dst += numCharsWritten; } if ( strcmp( BASE,"float") == 0 || strcmp( BASE,"float2") == 0 ) { numCharsWritten = sprintf(dst," %s%c)", id,'f'); } else { numCharsWritten = sprintf(dst," %s)", id); } dst += numCharsWritten; *_dst = dst; } else { numCharsWritten = sprintf(dst,"(%s)(", (div == 1)? DERIVED : (div == 2)? HALFWORD : (div == 4)? 
QUARTERWORD: HALFQUARTERWORD); dst += numCharsWritten; for( int i = 1 ; i < (vectorWidth/ div); i++) { numCharsWritten = sprintf(dst," %s,", id); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s)", id); dst += numCharsWritten; *_dst = dst; } } void kprintf::handleMUL(char **_src, char **_dst, bool vmul) { int numCharsWritten = 0; char id1[256], id2[256], id3[256]; char * ptr; char *src = *_src; char *dst = *_dst; int vwidth=1; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); // Get third ID strcpy( id3, ptr); *_src = ptr + strlen(ptr) + 1; //std::cout << id1 << " " << id2 << " " << id3 << std::endl; if ( (strcmp(id1, id2) == 0) || (strcmp(id1, id3)==0) || (strcmp(id2,id3) == 0) ) { if (vmul == false) { std::cout << "%MUL( C, A, B) : C , A and B have to be UNIQUE" << std::endl; } else { std::cout << "%VMUL( C, A, B) : C , A and B have to be UNIQUE" << std::endl; } throw -1; } switch(s_or_v) { case SCALAR: numCharsWritten = sprintf(dst, "%s = %s * %s", id1, id2, id3); dst += numCharsWritten; break; case VECTOR: if (vmul == true) { vwidth = vectorWidth; } else { vwidth = 1; } #ifdef MUL_SCALAR_UNROLL for(int i=0; i 2) { if ((i % 2) == 0) { numCharsWritten = sprintf(dst, "%s.%s = %s.%s;\n", id1, vecIndices[i], id2, vecIndices[i/2]); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst, "%s.%s = %s.%s;\n", id1, vecIndices[i], id3, vecIndices[i/2]); dst += numCharsWritten; } } else { if ((i % 2) == 0) { numCharsWritten = sprintf(dst, "%s.%s = %s;\n", id1, vecIndices[i], id2); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst, "%s.%s = %s;\n", id1, vecIndices[i], id3); dst += numCharsWritten; } } } break; default: std::cout << "handleComplexJoin: s_or_v is neither scalar nor a vector" << std::endl; throw -1; } *_dst = dst; } void kprintf::handleDIV(char **_src, char **_dst, bool vdiv) { int numCharsWritten = 0; char id1[256], id2[256], id3[256]; char * ptr; char *src = *_src; char *dst = *_dst; int vwidth=1; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); // Get third ID strcpy( id3, ptr); *_src = ptr + strlen(ptr) + 1; //std::cout << id1 << " " << id2 << " " << id3 << std::endl; if ( (strcmp(id1, id2) == 0) || (strcmp(id1, id3)==0) || (strcmp(id2,id3) == 0) ) { if (vdiv == false) { std::cout << "%DIV( C, A, B) : C , A and B have to be UNIQUE" << std::endl; } else { std::cout << "%VDIV( C, A, B) : C , A and B have to be UNIQUE" << std::endl; } throw -1; } switch(s_or_v) { case SCALAR: numCharsWritten = sprintf(dst, "%s = %s / %s", id1, id2, id3); dst += numCharsWritten; break; case VECTOR: if (vdiv == true) { vwidth = vectorWidth; } else { vwidth = 1; } #ifdef DIV_SCALAR_UNROLL for(int i=0; idoVLOAD) || (effectiveVectorWidthOnBaseType == 1)) { numCharsWritten = sprintf(dst, "*((__global %s*)(%s))", DERIVED, ptr); dst += numCharsWritten; } else { offsetptr = id2; for( int i=0; ; i++, offsetptr++) { if ( *offsetptr == ',') break; } offsetptr++; *offsetptr = '\0'; const char *string; char vecSuffix[3] = {0}; char tempStr[9] = {0}; generateVecSuffix( vecSuffix, effectiveVectorWidthOnBaseType); // VLOAD %TYPE%V from %PTYPE kind of memory locations strcpy( tempStr, "vload"); strcat( tempStr, vecSuffix); string = findTypeVLOAD(tempStr); if (string != NULL) { put( "%VLOAD", string); } else { 
std::cerr << "handleAlignedDataAccess: " << tempStr << " not a valid VLOAD type" << std::endl; } struct fmt f; f = get("%PTYPE"); numCharsWritten = sprintf(dst, "%s( %s (__global %s *)%s)", tempStr, id2, f.value, ptr); dst += numCharsWritten; } *_dst = dst; } // // %VSTORE(data, 0, address) // void kprintf::handleAlignedVSTORE(char **_src, char **_dst) { int numCharsWritten = 0; char * ptr, *id1, *id2, *id3; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "()"); // Get rid of %VSTORE keyword id1 = mystrtok( NULL, ","); // PTR now points to "data" id2 = mystrtok( NULL, ","); // PTR now points to "0" id3 = mystrtok( NULL, "()"); // PTR now points to "address" which is wrapped around in () *_src = id3 + strlen(id3) + 1; if (( ! this->doVSTORE) || (effectiveVectorWidthOnBaseType == 1)) { numCharsWritten = sprintf(dst, "*((__global %s*)(%s) + %s) = %s", DERIVED, id3, id2, id1); // NOTE:Assuming "__global" dst += numCharsWritten; } else { struct fmt vstore, ptype; vstore = get("%VSTORE_VALUE"); ptype = get("%PTYPE"); if ((vstore.value == NULL) || (ptype.value == NULL)) { numCharsWritten = sprintf(dst, "--ERROR in VSTORE--"); dst += numCharsWritten; return; } numCharsWritten = sprintf(dst, "%s( %s, %s, (__global %s *)%s)", vstore.value, id1, id2, ptype.value, id3); dst += numCharsWritten; } *_dst = dst; return; } void kprintf::handlePredicate(char **_src, char **_dst) { //int numCharsWritten = 0; char * ptr, *id1; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "()"); // Get rid of %IF keyword id1 = mystrtok( NULL, ")"); // PTR now points to "data" *_src = id1 + strlen(id1) + 1; src = *_src; struct fmt predicate = get(id1); int condition = atoi(predicate.value); if (condition >= 1) // PENDING: (condition > 1) worked fine before. { //printf("KPRINTF: Handle Predicate is TRUE - Predicate = %s\n", predicate.value); return; } else { //printf("KPRINTF: Handle Predicate is FALSE - predicate = %s\n", predicate.value); while((*src != '\0') && (*src != '\n')) { src++; } *dst = '\n'; dst++; } *_dst = dst; *_src = src; return; } void kprintf::handleADD_SUB(char **_src, char **_dst, const char op) { int numCharsWritten = 0; char id1[256], id2[256], id3[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); // Get third ID strcpy( id3, ptr); *_src = ptr + strlen(ptr) + 1; numCharsWritten = sprintf(dst, "%s = %s %c %s", id1, id2, op, id3); dst += numCharsWritten; *_dst = dst; } void kprintf::handleVLoadWithIncx(char **_src, char **_dst, bool ignoreFirst) { int numCharsWritten = 0; char id1[256], id2[256], id3[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); // Get third ID strcpy( id3, ptr); *_src = ptr + strlen(ptr) + 1; if (ignoreFirst == false) { numCharsWritten = sprintf(dst,"%s = ", id1); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"(%s)(", DERIVED); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst," %s[0 + (%s * %d)],", id2, id3, i); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s[0 + (%s * %d)])", id2, id3, vectorWidth - 1); dst += numCharsWritten; *_dst = dst; } void kprintf::handleVStoreWithIncx(char **_src, char **_dst) { 
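    // Expansion sketch (operand names hypothetical): a template line such as
    //     %VSTOREWITHINCX( X, v, incx );
    // is emitted for a real type of vector width 4 as per-element stores,
    //     X[0 + (incx * 0)] = v.S0;
    //     X[0 + (incx * 1)] = v.S1;
    //     X[0 + (incx * 2)] = v.S2;
    //     X[0 + (incx * 3)] = v.S3;
    // while the complex path below writes the .s01/.s23/... pairs instead.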
int numCharsWritten = 0; char id1[256], id2[256], id3[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); // Get third ID strcpy( id3, ptr); *_src = ptr + strlen(ptr) + 1; if ( s_or_v == SCALAR) { for( int i = 0 ; i < (vectorWidth); i++) { if (vectorWidth != 1) { numCharsWritten = sprintf(dst," %s[0 + (%s * %d)] = %s.%s;\n", id1, id3, i, id2, vecIndices[i]); } else { numCharsWritten = sprintf(dst," %s[0 + (%s * %d)] = %s;\n", id1, id3, i, id2); } dst += numCharsWritten; } } else { for( int i = 0 ; i < (vectorWidth); i++) { numCharsWritten = sprintf(dst," %s[0 + (%s * %d)] = %s.s%d%d;\n", id1, id3, i, id2, (i*2), (i*2 + 1)); dst += numCharsWritten; } } *_dst = dst; } void kprintf::handleReduceSum(char **_src, char **_dst) { int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if(vectorWidth > 1) { if ( s_or_v == SCALAR) { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"%s.%s + ", id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"%s.%s;\n", id1, vecIndices[ (vectorWidth - 1)]); dst += numCharsWritten; } else { for( int i = 0 ; i < (vectorWidth- 1); i++) { numCharsWritten = sprintf(dst,"%s.s%d%d + ", id1,(i*2), (i*2 + 1)); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"%s.s%d%d;\n", id1,((vectorWidth- 1)*2), ((vectorWidth- 1)*2 + 1)); dst += numCharsWritten; } } else { numCharsWritten = sprintf(dst,"(%s);\n", id1); dst += numCharsWritten; } *_dst = dst; } void kprintf::handleReduceMax(char **_src, char **_dst) { int numCharsWritten = 0; // val, maxVal, index, impl char id1[256], id2[256], id3[256], id4[256]; char tempStr[512]; char * ptr; char *src = *_src; char *dst = *_dst; bool reduceMaxWithIndex = false, followLowIndex = true; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); // After the first parameter is parsed, extract everything till you encounter ';' // Store this substring in a temp string. Then check if any extra parameter(overloaded) was passed using this substring ptr = mystrtok( NULL, ";"); *_src = ptr + strlen(ptr) + 1; // 'src' string parsing is over at this point tempStr[0] = '('; tempStr[1] = 0; strcat(tempStr, ptr); ptr = mystrtok( tempStr, "(,)"); ptr = mystrtok( NULL, "(,)"); // extract 2nd parameter from tempStr. 
Will be empty if 2nd parameter was not passed strcpy( id2, ptr); ptr = mystrtok( NULL, "(,)"); strcpy( id3, ptr); ptr = mystrtok( NULL, "(,)"); strcpy( id4, ptr); if(strcmp(id3, "") != 0) { reduceMaxWithIndex = true; } if(!strcmp(id4, "0")) { followLowIndex = false; } #ifdef DEBUG_AMAX std::cerr << "Handling AMAX CASE: reduceMaxWithIndex:" << reduceMaxWithIndex << " and followLowIndex: " << followLowIndex << " id1:" << id1 << " id2:" << id2 << " id3:" << id3 << " id4:" << id4 << std::endl; #endif if(vectorWidth > 1) { if ((s_or_v == SCALAR) && (!reduceMaxWithIndex)) { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"fmax( %s.%s, ", id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.%s ", id1, vecIndices[ (vectorWidth - 1)]); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } else if(reduceMaxWithIndex) { if(followLowIndex) { numCharsWritten = sprintf(dst,"%s = 0;",id3); dst += numCharsWritten; for(int i = 1 ; i < (vectorWidth); i++) { numCharsWritten = sprintf(dst,"\n\t(%s.%s > %s.S0)? (%s = %d, %s.S0 = %s.%s):1;", id1, vecIndices[i], id1, id3, i, id1, id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"\n\t%s = %s.s0;", id2, id1); dst += numCharsWritten; } else // Follow High Index { numCharsWritten = sprintf(dst,"%s = 0;",id3); dst += numCharsWritten; for(int i = 1 ; i < (vectorWidth); i++) { numCharsWritten = sprintf(dst,"\n\t(%s.%s >= %s.S0)? (%s = %d, %s.S0 = %s.%s):1;", id1, vecIndices[i], id1, id3, i, id1, id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"\n\t%s = %s.s0;", id2, id1); dst += numCharsWritten; } } else { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"fmax( %s.s%d%d, ", id1, (i*2), (i*2 + 1)); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.s%d%d ", id1, ((vectorWidth- 1)*2), ((vectorWidth- 1)*2 + 1)); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } } else { if(reduceMaxWithIndex) { numCharsWritten = sprintf(dst, "%s = 0;\n",id3); dst += numCharsWritten; numCharsWritten = sprintf(dst, "%s = %s;\n", id2, id1); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst,"(%s);\n", id1); dst += numCharsWritten; } } *_dst = dst; } void kprintf::handleReduceMin(char **_src, char **_dst) { int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if(vectorWidth > 1) { if ( s_or_v == SCALAR) { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"fmin( %s.%s, ", id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.%s ", id1, vecIndices[ (vectorWidth - 1)]); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } else { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"fmin( %s.s%d%d, ", id1, (i*2), (i*2 + 1)); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.s%d%d ", id1, ((vectorWidth- 1)*2), ((vectorWidth- 1)*2 + 1)); dst += numCharsWritten; 
for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } } else { numCharsWritten = sprintf(dst,"(%s);\n", id1); dst += numCharsWritten; } *_dst = dst; } void kprintf::handleReduceHypot(char **_src, char **_dst) { int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if(vectorWidth > 1) { if ( s_or_v == SCALAR) { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"hypot( %s.%s, ", id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.%s ", id1, vecIndices[ (vectorWidth - 1)]); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } else { for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,"hypot( %s.s%d%d, ", id1, (i*2), (i*2 + 1)); dst += numCharsWritten; } numCharsWritten = sprintf(dst," %s.s%d%d ", id1, ((vectorWidth- 1)*2), ((vectorWidth- 1)*2 + 1)); dst += numCharsWritten; for( int i = 0 ; i < (vectorWidth - 1); i++) { numCharsWritten = sprintf(dst,")"); dst += numCharsWritten; } numCharsWritten = sprintf(dst,";\n"); dst += numCharsWritten; } } else { numCharsWritten = sprintf(dst,"(%s);\n", id1); dst += numCharsWritten; } *_dst = dst; } // // scalar = %REDUCE_SUM_REAL_HV(half-vector), %REDUCE_SUM_REAL_V(vector) // void kprintf::handleReduceSumReal(char **_src, char **_dst, int vlength) { int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if (!vlength) //Can happen for SCALAR cases where source code contains this within COMPLEX define { // // Dont generate a thing. // The src pointer has already been advanced to next line // Just move on.. // return; } if (vlength != 1) { for( int i = 0 ; i < (vlength - 1); i++) { numCharsWritten = sprintf(dst,"(%s).%s + ", id1, vecIndices[i]); dst += numCharsWritten; } numCharsWritten = sprintf(dst,"(%s).%s;\n", id1, vecIndices[ (vlength - 1)]); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst,"(%s);\n ", id1); dst += numCharsWritten; } *_dst = dst; } void kprintf::handleCONJUGATE(char **_src, char **_dst) { // %CONJUGATE( doConj, loadedA ); // loadedA = ((doConj == 1)? (loadedA.odd = -loadedA.odd, loadedA) : loadedA); int numCharsWritten = 0; char id1[256], id2[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); ptr = mystrtok( NULL, "(,)"); // Get second ID strcpy( id2, ptr); *_src = ptr + strlen(ptr) + 1; if ( s_or_v == VECTOR) { numCharsWritten = sprintf(dst,"%s = ((%s == 1)? 
( %s.odd = -%s.odd, %s) : %s)", id2, id1, id2, id2, id2, id2); dst += numCharsWritten; } *_dst = dst; } void kprintf::handleClearImaginary(char **_src, char **_dst) { // %CLEAR_IMAGINARY( varName ); // generates varName.odd = 0; incase of complex type int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if ( s_or_v == VECTOR) { numCharsWritten = sprintf(dst,"%s.odd = 0.0f", id1); dst += numCharsWritten; } *_dst = dst; } static const char * itoa(int n) { if (n > 16) return (const char*) NULL; return numbers[n]; } // // PENDING: COMPLEX DATA TYPE HANDLING may need special attention // void kprintf::handleVFOR(char **src, char **dst, bool isReal) { char *start, *end; char *vforBody, *vforBodyTemp, *vforGeneratedBody; int bracecount = 0; int vforBodyLength; if (isReal == false) { start = (*src) + strlen("%VFOR"); } else { start = (*src) + strlen("%VFOR_REAL"); } while ( (*start != '{') && (*start != 0)) { //PENDING: if (notwhitespace(*start)) { signal exception bad syntax } start++; } if (*start == 0) { // PENDING: Raise an EXCEPTION! printf("KPRINTF: handleVFOR: Bad Syntax...\n"); return; } bracecount = 1; end = start+1; while(bracecount) { if (*end == 0) { break; } else if (*end == '{') { bracecount++; } else if (*end == '}') { bracecount--; } end++; } if (*end == 0) { // PENDING: Raise an EXCEPTION! printf("KPRINTF: handleVFOR: Bad Syntax...\n"); return; } vforBodyLength = end - start; vforBody = (char*)malloc((vforBodyLength + 1)*sizeof(char)); vforBodyTemp = (char*)malloc((vforBodyLength + 1)*sizeof(char)); vforGeneratedBody = (char*)malloc(((vforBodyLength + 1)*sizeof(char)) * vectorWidth * 2); memcpy(vforBody, start, vforBodyLength); vforBody[vforBodyLength] = 0; for(int v=0; vdataType, this->vectorWidth, this->doVLOAD, this->doVSTORE); child->put("%VFORINDEX", itoa(v)); if ((isReal == true) || (this->dataType == 'S') || (this->dataType == 'D')) { // // Treat like REAL type // if (vectorWidth != 1) { child->put("%VFORSUFFIX", vecIndicesWithDot[v]); } else { child->put("%VFORSUFFIX", ""); } } else { // Complex Data Type Involved if (vectorWidth != 1) { child->put("%VFORSUFFIX", vecComplexIndicesWithDot[v]); } else { child->put("%VFORSUFFIX", ""); } } strcpy(vforBodyTemp, vforBody); child->spit(vforGeneratedBody, vforBodyTemp); strcat(*dst, vforGeneratedBody); *dst += strlen(vforGeneratedBody); delete child; } *src = end; free(vforBody); free(vforBodyTemp); free(vforGeneratedBody); return; } void kprintf::handleReductionFramework(char **_src, char **_dst, REDUCTION_TYPE reductionType) { /* * Syntax: %REDUCTION_BY_SUM( privateVariableName ); or * %REDUCTION_BY_MAX( privateVariableName ); or * %REDUCTION_BY_MAX( privateVariableName, privateVariableName2, privateVarName3); or * %REDUCTION_BY_MIN( privateVariableName ); or * %REDUCTION_BY_HYPOT( privateVariableName ); or * %REDUCTION_BY_SSQ( scale, ssq ); * Reduces all elements in a workgroup by taking value from 'privateVariableName' of each work-item * and places the reduced item in 'privateVariableName' of the first work-item (work-item 0) * */ int numCharsWritten = 0; // Value, Index, Implementation char privateVarName[256], privateVarName2[256], privateVarName3[256]; char tempStr[512]; char * ptr; char *src = *_src; char *dst = *_dst; bool reductionWithIndex = false; RedWithIndexImpl impl; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( 
privateVarName, ptr); // After the first parameter is parsed, extract everything till you encounter ';' // Store this substring in a temp string. Then check if any extra parameter(overloaded) was passed using this substring ptr = mystrtok( NULL, ";"); *_src = ptr + strlen(ptr) + 1; // 'src' string parsing is over at this point tempStr[0] = '('; tempStr[1] = 0; strcat(tempStr, ptr); ptr = mystrtok( tempStr, "(,)"); ptr = mystrtok( NULL, "(,)"); // extract 2nd parameter from tempStr. Will be empty if 2nd parameter was not passed strcpy( privateVarName2, ptr); ptr = mystrtok( NULL, "(,)"); strcpy( privateVarName3, ptr); // This indicates that there was a second parameter in the call // Overloaded call of REDUCTION_BY_MAX for MAX_WITH_INDEX // if(strcmp(privateVarName3, "") != 0) { reductionWithIndex = true; if(!strcmp(privateVarName3, "0")) { impl = ATOMIC_FLI; } else if(!strcmp(privateVarName3, "1")) { impl = REG_FLI; } else if(!strcmp(privateVarName3, "2")) { impl = ATOMIC_FHI; } else if(!strcmp(privateVarName3, "3")) { impl = REG_FHI; } else { std::cerr << "ERROR: Invalid Reduction Type implementation"; } } char ldsVarName[8], ldsVarName2[8], localId[8], selected[8]; char p1[8], p2[8], p3[8], p4[8], p5[8]; getRandomString(ldsVarName, 5); getRandomString(ldsVarName2, 5); getRandomString(localId, 5); getRandomString(selected, 5); getRandomString(p1, 5); getRandomString(p2, 5); getRandomString(p3, 5); getRandomString(p4, 5); getRandomString(p5, 5); if(reductionWithIndex) { numCharsWritten = sprintf(dst, "uint %s;\n", selected); dst += numCharsWritten; numCharsWritten = sprintf(dst, "__local %s %s [ %d ];\n", (get("%PTYPE").value), ldsVarName, (this->wgSize)); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t%s [ %s ] = %s;\n", localId, ldsVarName, localId, privateVarName); dst += numCharsWritten; switch(impl) { case REG_FLI: numCharsWritten = sprintf(dst, "\t__local uint %s [ %d ];\n", ldsVarName2, (this->wgSize)); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t%s [ %s ] = %s;\n", ldsVarName2, localId, privateVarName2); dst += numCharsWritten; break; case ATOMIC_FLI: numCharsWritten = sprintf(dst, "\t__local uint %s[1];\n", ldsVarName2); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tif(%s == 0){%s[0] = UINT_MAX;}\n", localId, ldsVarName2); dst += numCharsWritten; break; } } else { if(reductionType == REDUCTION_BY_SSQ) { numCharsWritten = sprintf(dst, "__local %s %s [ %d ], %s [ %d ];\n", (get("%PTYPE").value), ldsVarName, (this->wgSize), ldsVarName2, (this->wgSize) ); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t %s [ %s ] = %s; %s [ %s ] = %s;\n", localId, ldsVarName, localId, privateVarName, ldsVarName2, localId, privateVarName2); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t%s %s, %s, %s, %s, %s;\n", (get("%PTYPE").value), p1, p2, p3, p4, p5); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst, "__local %s %s [ %d ];\n", (get("%TYPE").value), ldsVarName, (this->wgSize)); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tuint %s = get_local_id(0);\n\t %s [ %s ] = %s;\n", localId, ldsVarName, localId, privateVarName); dst += numCharsWritten; } } numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n"); dst += numCharsWritten; // selected = (ldsVal[lid+32] > ldsVal[lid]) ? lid + 32 : lid; // selected = (ldsVal[lid+32] == ldsVal[lid]) ? (ldsIndex[lid+32] < ldsIndex[lid] ? 
lid + 32 : lid) : selected; for( int i=(this->wgSize/2); i>=2; i=(i/2) ) { if(reductionWithIndex) { switch(impl) { //case ATOMIC_FLI: //case ATOMIC_FHI: case REG_FLI: //case REG_FHI: numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n ", localId, i); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\n\t%s = (%s[%s + %d] > %s[%s]) ? %s + %d : %s;", selected, ldsVarName, localId, i, ldsVarName, localId, localId, i, localId); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\n\t%s = (%s[%s + %d] == %s[%s]) ? ((%s[%s + %d] < %s[%s]) ? %s + %d : %s) : %s;", selected, ldsVarName, localId, i, ldsVarName, localId, ldsVarName2, localId, i, ldsVarName2, localId, localId, i, localId, selected); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t%s[%s] = %s[%s];\n\t %s[%s] = %s[%s];\n", ldsVarName, localId, ldsVarName, selected, ldsVarName2, localId, ldsVarName2, selected); dst += numCharsWritten; break; case ATOMIC_FLI: numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n ", localId, i); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\n\t%s[%s] = fmax(%s[%s + %d], %s[%s]);", ldsVarName, localId, ldsVarName, localId, i, ldsVarName, localId); dst += numCharsWritten; break; } } else { numCharsWritten = sprintf(dst, "\tif( %s < %d ) {\n\t\t", localId, i); dst += numCharsWritten; switch( reductionType ) { case REDUCTION_BY_SUM : numCharsWritten = sprintf(dst, " %s [ %s ] = %s [ %s ] + %s [ %s + %d ];\n", ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i); dst += numCharsWritten; break; case REDUCTION_BY_MAX : numCharsWritten = sprintf(dst, " %s [ %s ] = fmax( %s [ %s ] , %s [ %s + %d ] );\n", ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i); dst += numCharsWritten; break; case REDUCTION_BY_MIN : numCharsWritten = sprintf(dst, " %s [ %s ] = fmin( %s [ %s ] , %s [ %s + %d ] );\n", ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i); dst += numCharsWritten; break; case REDUCTION_BY_HYPOT : numCharsWritten = sprintf(dst, " %s [ %s ] = hypot( %s [ %s ] , %s [ %s + %d ] );\n", ldsVarName, localId, ldsVarName, localId, ldsVarName, localId, i); dst += numCharsWritten; break; case REDUCTION_BY_SSQ : numCharsWritten = sprintf(dst, " %s = %s = %s [ %s ];\n", p1, p2, ldsVarName, localId); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = %s [ %s ];\n", p3, ldsVarName2, localId); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = %s [ %s + %d];\n\t %s = %s [ %s + %d];\n", p4, ldsVarName, localId, i, p5, ldsVarName2, localId, i); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = fmax( %s, %s );\n", p2, p2, p4); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = (isnotequal(%s, (%s)0.0))?\n", p3, p2, (get("%PTYPE").value)); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t (((%s / %s) * (%s / %s) * %s) + ((%s / %s) * (%s / %s) * %s)) : %s;\n", p1, p2, p1, p2, p3, p4, p2, p4, p2, p5, p3); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s [ %s ] = %s;\n %s [ %s ] = %s;\n", ldsVarName, localId, p2, ldsVarName2, localId, p3); dst += numCharsWritten; break; default : printf("\nInvalid reduction operator!!\n"); throw -1; break; } } numCharsWritten = sprintf(dst, "\t}\n\tbarrier(CLK_LOCAL_MEM_FENCE);\n\n"); dst += numCharsWritten; } if(reductionWithIndex) { switch(impl) { case REG_FLI: numCharsWritten = sprintf(dst, "\tif( %s == 0 ) {\n\t%s = (%s[1] > %s[0]) ? 
1 : 0;\n", localId, selected, ldsVarName, ldsVarName); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t%s = (%s[1] == %s[0]) ? ((%s[1] < %s[0]) ? 1 : 0) : %s;\n", selected, ldsVarName, ldsVarName, ldsVarName2, ldsVarName2, selected); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t%s = %s[%s];\n\t %s = %s[%s];}\n", privateVarName, ldsVarName, selected, privateVarName2, ldsVarName2, selected); dst += numCharsWritten; break; case ATOMIC_FLI: numCharsWritten = sprintf(dst, "\tif(%s == 0){%s[0] = fmax(%s[1], %s[0]);}\n", localId, ldsVarName, ldsVarName, ldsVarName); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tif(%s == %s[0]){atomic_min((%s + 0), %s);}\n", privateVarName, ldsVarName, ldsVarName2, privateVarName2); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tif(%s == 0){%s = %s[0]; %s = %s[0];}\n", localId, privateVarName2, ldsVarName2, privateVarName, ldsVarName); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\tbarrier(CLK_LOCAL_MEM_FENCE);\n"); dst += numCharsWritten; break; } } else { numCharsWritten = sprintf(dst, "\tif( %s == 0 ) {\n\t", localId); dst += numCharsWritten; switch( reductionType ) { case REDUCTION_BY_SUM : numCharsWritten = sprintf(dst, "%s = %s [0] + %s [1];\n\t}", privateVarName, ldsVarName, ldsVarName); dst += numCharsWritten; break; case REDUCTION_BY_MAX : numCharsWritten = sprintf(dst, "%s = fmax( %s [0] , %s [1] );\n\t}", privateVarName, ldsVarName, ldsVarName); dst += numCharsWritten; break; case REDUCTION_BY_MIN : numCharsWritten = sprintf(dst, "%s = fmin( %s [0] , %s [1] );\n\t}", privateVarName, ldsVarName, ldsVarName); dst += numCharsWritten; break; case REDUCTION_BY_HYPOT : numCharsWritten = sprintf(dst, "%s = hypot( %s [0] , %s [1] );\n\t}", privateVarName, ldsVarName, ldsVarName); dst += numCharsWritten; break; case REDUCTION_BY_SSQ : numCharsWritten = sprintf(dst, " %s = %s = %s [0];\n", p1, p2, ldsVarName); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = %s [0];\n", p3, ldsVarName2); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = %s [1];\n\t %s = %s [1];\n", p4, ldsVarName, p5, ldsVarName2); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = fmax( %s, %s );\n", p2, p2, p4); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = (isnotequal(%s, (%s)0.0))?\n", p3, p2, (get("%PTYPE").value)); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t (((%s / %s) * (%s / %s) * %s) + ((%s / %s) * (%s / %s) * %s)) : %s;\n", p1, p2, p1, p2, p3, p4, p2, p4, p2, p5, p3); dst += numCharsWritten; numCharsWritten = sprintf(dst, "\t %s = %s;\n\t %s = %s;\n\t}", privateVarName, p2, privateVarName2, p3); dst += numCharsWritten; break; default : printf("\nInvalid reduction operator!!\n"); throw -1; break; } } *_dst = dst; } void kprintf::handleVABS(char **_src, char **_dst) { int numCharsWritten = 0; char id1[256]; char * ptr; char *src = *_src; char *dst = *_dst; ptr = mystrtok( src, "(,)"); ptr = mystrtok( NULL, "(,)"); // Get first ID strcpy( id1, ptr); *_src = ptr + strlen(ptr) + 1; if(s_or_v == SCALAR) { numCharsWritten = sprintf(dst, "fabs(%s)", id1); dst += numCharsWritten; } else { numCharsWritten = sprintf(dst, "fabs(%s.even) + fabs(%s.odd)", id1, id1); dst += numCharsWritten; } *_dst = dst; } void kprintf::getRandomString(char *str, int length) { static char charset[] = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890"; length = (length==0)? 1: length; str[0] = charset[rand() % 52]; // First char has to be alphabet for (int i = 1; i < length; i++) str[i] = charset[rand() % 62]; str[length] = '\0'; return; } void kprintf::doConstruct(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize) { this->doVLOAD = doVLOAD; this->doVSTORE = doVSTORE; this->wgSize = _wgSize; if ((strcmp(type, "single") != 0) && (strcmp(type,"double") != 0) && (strcmp(type,"complex") != 0) && (strcmp(type,"doublecomplex") != 0)) { std::cout << "kprint() constructor: Type is not supported" << std::endl; throw -1; } if (vecWidth <= 0) { std::cout << "kprint() constructor: vecWidth is <= 0" << std::endl; throw -1; } maxKeySize = 0; // NOTE: This has to be done before REGISTERING types. Dependency on "put" // // Arrive at %TYPE and %TYPE%V attributes // if (strcmp(type,"single") == 0) { put("%PTYPE", "float"); // Primitive Type put("%PREFIX", "S"); // Prefix registerType("float", vecWidth); } if (strcmp(type,"double") == 0) { put("%PTYPE", "double"); // Primitive Type put("%PREFIX", "D"); // Prefix registerType("double", vecWidth); } if (strcmp(type,"complex") == 0) { put("%PTYPE", "float"); // Primitive Type put("%PREFIX", "C"); // Prefix registerType("float2", vecWidth, 2); } if (strcmp(type,"doublecomplex") == 0) { put("%PTYPE", "double"); // Primitive Type put("%PREFIX", "Z"); // Prefix registerType("double2", vecWidth, 2); } registerVSTORE(); //Get "%VSTORE_VALUE" - This is for internal use to handle %VLOAD put("%VLOAD", NULL); put("%VSTORE", NULL); put("%CONJUGATE", NULL);//Directive put("%CLEAR_IMAGINARY", NULL);//Directive put("%COMPLEX_JOIN", NULL);//Directive put("%MAD", NULL); //Directive put("%VMAD", NULL); //Directive put("%VMAD_AND_REDUCE", NULL); //Directive put("%MAD_AND_REDUCE", NULL); //Directive put("%MUL", NULL); //Directive put("%VMUL", NULL); //Directive put("%ADD", NULL); //Directive put("%SUB", NULL); //Directive put("%DIV", NULL); //Directive put("%VDIV", NULL); //Directive put("%MAKEVEC", NULL); //Directive put("%VMAKEVEC", NULL); //Directive put("%INIT", NULL); //Directive put("%VMAKEHVEC", NULL);//Directive put("%VMAKEQVEC", NULL);//Directive put("%VMAKEOVEC", NULL);//Directive put("%VLOADWITHINCX", NULL);//Directive put("%VLOADWITHINCXV2", NULL);//Directive put("%VSTOREWITHINCX", NULL);//Directive put("%REDUCE_SUM", NULL);//Directive put("%REDUCE_SUM_REAL_HV", NULL);//Directive put("%REDUCE_MAX", NULL);//Directive put("%REDUCE_MIN", NULL);//Directive put("%REDUCE_HYPOT", NULL);//Directive put("%IF", NULL);//Directive put("%VFOR_REAL", NULL);//Directive put("%VFOR", NULL);//Directive put("%REDUCTION_BY_SUM", NULL); //Directive put("%REDUCTION_BY_MAX", NULL); //Directive put("%REDUCTION_BY_MIN", NULL); //Directive put("%REDUCTION_BY_HYPOT", NULL); //Directive put("%REDUCTION_BY_SSQ", NULL); //Directive put("%VABS", NULL); //Directive put("%ABS", NULL); //Directive srand((unsigned int)time(NULL)); return; } kprintf::kprintf(char _type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize) { this->dataType = _type; switch(_type) { case 'S': doConstruct("single", vecWidth, doVLOAD, doVSTORE, _wgSize); break; case 'D': doConstruct("double", vecWidth, doVLOAD, doVSTORE, _wgSize); break; case 'C': doConstruct("complex", vecWidth, doVLOAD, doVSTORE, _wgSize); break; case 'Z': doConstruct("doublecomplex", vecWidth, doVLOAD, doVSTORE, _wgSize); break; default: printf("WARNING: kprintf called with wrong arguments!\n"); break; } return; 
} kprintf::kprintf(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int _wgSize) { if (strcmp(type, "single") == 0) this->dataType = 'S'; else if (strcmp(type, "double") == 0) this->dataType = 'D'; else if (strcmp(type, "complex") == 0) this->dataType = 'C'; else if (strcmp(type, "doublecomplex") == 0) this->dataType = 'Z'; doConstruct(type, vecWidth, doVLOAD, doVSTORE, _wgSize); return; } void kprintf::put(const char *key, const char *value) { struct fmt f; if(key[0] != '%') { std::cout << "Addition of key " << key << " failed as it does not start with %" << std::endl; return; } f.key = key; f.value = value; if (strlen(key) > maxKeySize) { maxKeySize = strlen(key); } v.push_back(f); return; } // // PENDING: // Needs ammendment at a later point of time when we support MACROS // int kprintf::real_strlen(const char *src) { int length = 0; struct fmt f; while(src[0]) { f = get(src); if (f.value != NULL) { length += (int)strlen(f.value); src += strlen(f.key); } else { length++; src++; } } return length+1; // +1 for the '\0' character } void kprintf::spit(char *dst, char *src) { struct fmt f; while(src[0]) { f = get(src); if ((f.value != NULL) || (f.key != NULL)) { if(f.value != NULL) { // // Normal Replacement Would Suffice // strncpy(dst, f.value, strlen(f.value)); dst += strlen(f.value); src += strlen(f.key); } else { // // Directive - Function Like Macro // if( strcmp(f.key, "%MAD") == 0) { handleMAD(&src, &dst); } else if ( strcmp(f.key, "%VMAD") == 0) { handleMAD(&src, &dst, true); } else if ( strcmp(f.key, "%VMAD_AND_REDUCE") == 0) { handleVMAD_AND_REDUCE(&src, &dst); } else if ( strcmp(f.key, "%MAD_AND_REDUCE") == 0) { handleMAD_AND_REDUCE(&src, &dst); } else if ( strcmp(f.key, "%CONJUGATE") == 0) { handleCONJUGATE(&src, &dst); } else if ( strcmp(f.key, "%CLEAR_IMAGINARY") == 0) { handleClearImaginary(&src, &dst); } else if (strcmp(f.key, "%MUL") == 0) { handleMUL(&src, &dst); } else if (strcmp(f.key, "%VMUL") == 0) { handleMUL(&src, &dst, true); } else if (strcmp(f.key, "%ADD") == 0) { handleADD_SUB(&src, &dst, '+'); } else if (strcmp(f.key, "%SUB") == 0) { handleADD_SUB(&src, &dst, '-'); } else if (strcmp(f.key, "%DIV") == 0) { handleDIV(&src, &dst); } else if (strcmp(f.key, "%VDIV") == 0) { handleDIV(&src, &dst, true); } else if (strcmp(f.key, "%VMAKEVEC") == 0) { handleMakeVector(&src, &dst); } else if (strcmp(f.key, "%VMAKEHVEC") == 0) { handleMakeVector(&src, &dst, 2); } else if (strcmp(f.key, "%VMAKEQVEC") == 0) { handleMakeVector(&src, &dst, 4); } else if (strcmp(f.key, "%VMAKEOVEC") == 0) { handleMakeVector(&src, &dst, 8); } else if ((strcmp(f.key, "%MAKEVEC") == 0) || (strcmp(f.key, "%INIT") == 0) ) { handleMakeVector(&src, &dst, 0); // To handle Scalar case } else if (strcmp(f.key, "%VLOADWITHINCX") == 0) { handleVLoadWithIncx(&src, &dst); }else if (strcmp(f.key, "%VLOADWITHINCXV2") == 0) { handleVLoadWithIncx(&src, &dst, true); } else if (strcmp(f.key, "%VSTOREWITHINCX") == 0) { handleVStoreWithIncx(&src, &dst); }else if (strcmp(f.key, "%REDUCE_SUM") == 0) { handleReduceSum(&src, &dst); } else if (strcmp(f.key, "%REDUCE_SUM_REAL_HV") == 0) { handleReduceSumReal(&src, &dst, effectiveVectorWidthOnBaseType/2); } else if (strcmp(f.key, "%REDUCE_MAX") == 0) { handleReduceMax(&src, &dst); } else if (strcmp(f.key, "%REDUCE_MIN") == 0) { handleReduceMin(&src, &dst); } else if (strcmp(f.key, "%REDUCE_HYPOT") == 0) { handleReduceHypot(&src, &dst); }else if (strcmp(f.key, "%VLOAD") == 0) { handleAlignedDataAccess(&src, &dst); }else if (strcmp(f.key, "%VSTORE") == 0) { 
handleAlignedVSTORE(&src, &dst); } else if (strcmp(f.key, "%IF") == 0) { handlePredicate(&src, &dst); } else if (strcmp(f.key, "%COMPLEX_JOIN") == 0) { handleComplexJoin(&src, &dst); } else if (strcmp(f.key, "%VFOR_REAL") == 0) { handleVFOR(&src, &dst, true); } else if (strcmp(f.key,"%VFOR") == 0) { handleVFOR(&src, &dst, false); } else if (strcmp(f.key,"%REDUCTION_BY_SUM") == 0) { handleReductionFramework(&src, &dst, REDUCTION_BY_SUM); } else if (strcmp(f.key,"%REDUCTION_BY_MAX") == 0) { handleReductionFramework(&src, &dst, REDUCTION_BY_MAX); } else if (strcmp(f.key,"%REDUCTION_BY_MIN") == 0) { handleReductionFramework(&src, &dst, REDUCTION_BY_MIN); } else if (strcmp(f.key,"%REDUCTION_BY_HYPOT") == 0) { handleReductionFramework(&src, &dst, REDUCTION_BY_HYPOT); } else if (strcmp(f.key,"%REDUCTION_BY_SSQ") == 0) { handleReductionFramework(&src, &dst, REDUCTION_BY_SSQ); } else if (strcmp(f.key,"%VABS") == 0) { handleVABS(&src, &dst); } else { std::cerr << "Problems in spitting: Internal error. Unable to handle key " << f.key << std::endl; *dst = *src; dst++; src++; } } } else { *dst = *src; dst++; src++; } } *dst = '\0'; } clblas-2.10/src/library/blas/gens/legacy/000077500000000000000000000000001264277366700202745ustar00rootroot00000000000000clblas-2.10/src/library/blas/gens/legacy/blas_kgen_legacy.c000066400000000000000000000461441264277366700237220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * common stuff for blas related * kernel generators, legacy part */ #include #include #include #include #include #include #include #include #include "blas_kgen_legacy.h" void declareBlasEnums(struct KgenContext *ctx) { kgenAddStmt(ctx, "typedef enum clblasOrderEnum {\n" " clblasRowMajor,\n" " clblasColumnMajor\n" "} clblasOrder;\n" "\n" "typedef enum clblasTransposeEnum {\n" " clblasNoTrans,\n" " clblasTrans,\n" " clblasConjTrans\n" "} clblasTranspose;\n" "\n" "typedef enum clblasUploEnum {\n" " clblasUpper,\n" " clblasLower\n" "} clblasUplo;\n" "\n" "typedef enum clblasDiagEnum {\n" " clblasUnit,\n" " clblasNonUnit\n" "} clblasDiag;\n" "\n" "typedef enum clblasSideEnum {\n" " clblasLeft,\n" " clblasRight\n" "} clblasSide;\n\n"); } static unsigned int getTmpVecLen( const BlasGenSettings *gset, UpdateResultFlags uflags, const char **vecName) { const CLBLASKernExtra *kextra = gset->kextra; unsigned int vecLen; if (isComplexType(kextra->dtype) || (uflags & (UPRES_GENERIC | UPRES_NO_VECTORIZATION))) { vecLen = 1; } else { vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? 
kextra->vecLenC : kextra->vecLen; getVectorTypeName(kextra->dtype, vecLen, vecName, NULL); } return vecLen; } static void updateOptimResultGen( struct KgenContext *ctx, const BlasGenSettings *gset, unsigned int wvlen, unsigned int pitch, unsigned int regOff, const char *ldName, UpdateResultOp op, UpdateResultFlags flags, const char *cachedName) { char tmp[1024]; int tra, isDouble; bool useReg = true; char *regRole; char dst[80], src[80]; char vchunkTmp[64], vchunkReg[64]; unsigned int sizes[2]; unsigned int i, j, k; unsigned int off; const char *vfield; DataType dtype = gset->kextra->dtype; bool isPrivDest = ((flags & UPRES_PRIV_DEST) != 0); unsigned int vecLen; // vector length of the result's register block // vector length to update with at immediate operations unsigned int uplen; // vector length of the temporary storage location unsigned int tmpVecLen; const char *ptrName; sizes[0] = (unsigned int)gset->subdims[1].y; sizes[1] = (unsigned int)gset->subdims[1].x; j = 0; tra = ((flags & UPRES_COLUMN_MAJOR) != 0); isDouble = isDoubleBasedType(dtype); vfield = dtypeUPtrField(dtype); vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC : gset->kextra->vecLen; tmpVecLen = getTmpVecLen(gset, flags, NULL); getVectorTypeName(dtype, wvlen, NULL, &ptrName); if (isComplexType(dtype)) { vecLen = 1; } uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen; /* * Pass recursively over the major dimension with power of 2 vectors. * If the used type size is less then the current vector size, * use assembling/disassembling into/from a temporary vector. This is * for trying to increase effectiveness of operations with the global * memory due to vectorization. */ if (wvlen > sizes[1 - tra]) { wvlen /= 2; updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName, op, flags, cachedName); return; } if (wvlen == 1) { kgenAddStmt(ctx, "// Copy with single words\n"); } else { const char *s = (isDouble) ? "double" : "float"; sprintf(tmp, "// Copy with %s%d vectors\n", s, wvlen); kgenAddStmt(ctx, tmp); } for (i = 0; i < sizes[tra]; i++) { unsigned int roff; if (tra) { roff = regOff + i; } else { roff = regOff + i * pitch; } for (j = 0; j < sizes[1 - tra] / wvlen; j++) { if (wvlen > uplen) { if (isPrivDest) { sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0); sprintf(tmp, "tmp%s = uC.%s[%u];\n", vchunkTmp, ptrName, j); kgenAddStmt(ctx, tmp); } else { // assemble vector for (k = 0; k < wvlen; k += uplen) { off = (tra) ? (roff + k * pitch) : (roff + k); sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k); sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen); sprintf(tmp, "tmp%s = c[%u]%s;\n", vchunkTmp, off / vecLen, vchunkReg); kgenAddStmt(ctx, tmp); } } } if (isPrivDest && (wvlen > uplen)) { // disassemble temporary vector and do immediate result update for (k = 0; k < wvlen; k += uplen) { off = (tra) ? 
(roff + k * pitch) : (roff + k); sprintfVecChunk(vchunkTmp, tmpVecLen, uplen, k); sprintfVecChunk(vchunkReg, vecLen, uplen, off % vecLen); sprintf(src, "tmp%s", vchunkTmp); sprintf(dst, "c[%u]%s", off / vecLen, vchunkReg); genUpdateResultSingle(ctx, dst, src, gset, op, flags); } } else { if (wvlen > uplen) { sprintfVecChunk(vchunkTmp, tmpVecLen, wvlen, 0); sprintf(src, "tmp%s", vchunkTmp); useReg = false; } if (!isPrivDest) { sprintf(dst, "uC.%s[%u]", ptrName, j); if (cachedName) { char *p = dst + strlen(dst); strcat(p, " = "); p = dst + strlen(dst); sprintf(p, cachedName, i, j); } regRole = src; } else { useReg = true; regRole = dst; sprintf(src, "uC.%s[%u]", ptrName, j); } if (useReg) { sprintfVecChunk(vchunkReg, vecLen, uplen, roff % vecLen); sprintf(regRole, "c[%u]%s", roff / vecLen, vchunkReg); } genUpdateResultSingle(ctx, dst, src, gset, op, flags); } // update register offset if (tra) { roff += wvlen * pitch; } else { roff += wvlen; } } // move the destination pointer to the next line if ((i != sizes[tra] - 1)) { sprintf(tmp, "uC.%s += %s;\n", vfield, ldName); kgenAddStmt(ctx, tmp); if (tra) { kgenAddBlankLine(ctx); } } } if (j * wvlen != sizes[1 - tra]) { // increment pointers if (tra) { regOff += j * wvlen * pitch; } else { regOff += j * wvlen; } sprintf(tmp, "\n" "uC.%s = tmpC.%s + %u;\n" "tmpC = uC;\n", vfield, vfield, j * wvlen); kgenAddStmt(ctx, tmp); // go down sizes[1 - tra] -= j * wvlen; wvlen /= 2; updateOptimResultGen(ctx, gset, wvlen, pitch, regOff, ldName, op, flags, cachedName); } } static void updateGenericResultGen( struct KgenContext *ctx, const BlasGenSettings *gset, size_t pitch, UpresVarNames* uvars, UpdateResultOp op, UpdateResultFlags flags, const char *cachedName) { char tmp[1024], dst[128], src[128]; const char *boundNames[2] = {uvars->nrRows, uvars->nrCols}; const char *vecType = NULL; const char *vFieldVectorized; DataType dtype = gset->kextra->dtype; unsigned int wvlen; unsigned int sizes[2]; const char* vfield = dtypeUPtrField(dtype); bool tra = ((flags & UPRES_COLUMN_MAJOR) != 0); bool row = ((flags & UPRES_TAIL_ROW)); bool col = ((flags & UPRES_TAIL_COL)); bool iwc = ((flags & UPRES_INDEXING_WITH_CONSTANTS) != 0); int l0; int l1; unsigned int vecLen; // vector length of the result's register block // vector length to update with at immediate operations unsigned int uplen; // vector length of the temporary storage location char vchunkReg[64]; bool revert = false; vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC : gset->kextra->vecLen; if (isComplexType(dtype)) { vecLen = 1; } uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 
1 : vecLen; uplen = 1; sizes[0] = (unsigned int)gset->subdims[1].y; sizes[1] = (unsigned int)gset->subdims[1].x; if (iwc) { const char* l0var = boundNames[tra]; revert = (tra && col) || (!tra && row); if (revert) { sprintf(tmp, "uC.%s += (%s-1) * %s;\n", vfield, l0var, uvars->ld); } else { sprintf(tmp, "\n"); } kgenAddStmt(ctx, tmp); } wvlen = getTmpVecLen(gset, flags, &vecType); getVectorTypeName(dtype, wvlen, NULL, &vFieldVectorized); sprintf(tmp, "res.%s = c;\n", vFieldVectorized); kgenAddStmt(ctx, tmp); if (flags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) { char offStr[64]; char *p = offStr; offStr[0] = '\0'; if (flags & UPRES_TAIL_ROW) { sprintf(offStr, " + (%u - %s) * %lu", sizes[0], uvars->nrRows, pitch); p += strlen(offStr); } if (flags & UPRES_TAIL_COL) { sprintf(p, " + (%u - %s)", sizes[1], uvars->nrCols); } if (iwc) { sprintf(tmp, "res.%s = uC.%s%s;\n", vfield, vfield, offStr); sprintf(tmp, "\n"); } else { sprintf(tmp, "res.%s = res.%s%s;\n", vfield, vfield, offStr); } kgenAddStmt(ctx, tmp); } if (iwc) { int l0st = 1; int l0en = sizes[tra]; int l1st = 1; int l1en = sizes[1-tra]; const char* l0var = boundNames[tra]; const char* l1var = boundNames[1-tra]; for (l0 = l0en; l0 >= l0st; l0--) { sprintf(tmp, "if (%s) ",l0var); kgenBeginBranch(ctx, tmp); sprintf(tmp, "switch (%s)", l1var); kgenBeginBranch(ctx, tmp); for (l1 = l1en; l1 >= l1st; l1--) { int resId; sprintf(tmp, "case %d:\n", l1); kgenAddStmt(ctx, tmp); if (tra) { resId = (row) ? (l1en-l1)*(int)pitch : (l1-l1st)*(int)pitch; resId += (col)? (l0-l0st): (l0en-l0); } else { /////////////////////////// resId = (row) ? (l0-l0st)*(int)pitch : (l0en-l0)*(int)pitch; resId += (col)? (l1en-l1) : (l1-l1st); } if ((tra && row) || (!tra && col)) { sprintf(dst, "uC.%s[(%s+%d) %% %i]", vfield, l1var, (l1en - l1), (int)l1en); } else { sprintf(dst, "uC.%s[%d]", vfield, (l1-l1st)); } sprintfVecChunk(vchunkReg, vecLen, uplen, resId % vecLen); sprintf(src, "c[%u]%s", resId / vecLen, vchunkReg); if (flags & UPRES_PRIV_DEST) { genUpdateResultSingle(ctx, src, dst, gset, op, flags); } else { genUpdateResultSingle(ctx, dst, src, gset, op, flags); } } kgenEndBranch(ctx, NULL); if (revert) { sprintf(tmp, "uC.%s -= %s;\n", vfield, uvars->ld); } else { sprintf(tmp, "uC.%s += %s;\n", vfield, uvars->ld); } kgenAddStmt(ctx, tmp); sprintf(tmp, "%s--;\n", l0var); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } } else { sprintf(tmp, "for (i = 0; i < %s; i++)", boundNames[tra]); kgenBeginBranch(ctx, tmp); sprintf(tmp, "for (j = 0; j < %s; j++)", boundNames[1 - tra]); kgenBeginBranch(ctx, tmp); sprintf(dst, "uC.%s[i * %s + j]", vfield, uvars->ld); if (cachedName) { unsigned int i; char tmpcachedName[80] = " = "; strcat(tmpcachedName, cachedName); for (i = 3; i < strlen(tmpcachedName); i++) { if (strncmp(tmpcachedName+i, "%u", 2) == 0) { tmpcachedName[i+1] = 's'; } } sprintf(tmp, tmpcachedName, "i", "[j]"); strcat(dst, tmp); } if (tra) { sprintf(src, "res.%s[j * %lu + i]", vfield, pitch); } else { sprintf(src, "res.%s[i * %lu + j]", vfield, pitch); } if (flags & UPRES_PRIV_DEST) { genUpdateResultSingle(ctx, src, dst, gset, op, flags); } else { genUpdateResultSingle(ctx, dst, src, gset, op, flags); } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } } int updateResultGenOld( struct KgenContext *ctx, const BlasGenSettings *gset, UpdateResultOp op, UpdateResultFlags flags, const UpresVarNames *uvarNames) { char tmp[1024]; char *p = tmp; const char *typeName; const char *vecType = NULL; const char *vfield; const char *suff1; const char *suff2; int ret = 0; unsigned 
int sizes[2]; bool generic, tra; unsigned int wvlen; // length of vectors to copy with unsigned int uplen; // length of vectors to update result with size_t pitch; char LG; DataType dtype = gset->kextra->dtype; unsigned int vecLen; bool isInlined = (flags & UPRES_INLINE); UpresVarNames uvars; vecLen = (gset->flags & BGF_DISTINCT_VECLEN) ? gset->kextra->vecLenC : gset->kextra->vecLen; sizes[0] = (unsigned int)gset->subdims[1].y; sizes[1] = (unsigned int)gset->subdims[1].x; if (isComplexType(dtype)) { vecLen = 1; } if ((flags & UPRES_WITH_BETA) && (op != UPRES_SUM)) { return -EINVAL; } tra = ((flags & UPRES_COLUMN_MAJOR) != 0); generic = ((flags & UPRES_GENERIC) != 0); typeName = dtypeBuiltinType(dtype); vfield = dtypeUPtrField(dtype); pitch = roundUp(sizes[1], vecLen); // select write vectorization wvlen = getTmpVecLen(gset, flags, &vecType); uplen = (tra || (flags & UPRES_NO_VECTORIZATION)) ? 1 : vecLen; suff1 = (generic) ? "Generic" : ""; suff2 = (flags & UPRES_PRIV_DEST) ? "Rev" : ""; LG = (flags & UPRES_USE_LDS) ? 'L' : 'G'; if (!isInlined) { const char *outTypeName; const char *memPref = (flags & UPRES_USE_LDS) ? "__local" : "__global"; getResultGPRsInfo(dtype, NULL, vecLen, NULL, &outTypeName); // define the function sprintf(tmp, "void\n" "updateResult%s%s%c(\n" " %s %s *C,\n" " %s *c,\n" " %s alpha,\n" " uint startRow,\n" " uint startCol,\n" " uint ld", suff1, suff2, LG, memPref, typeName, outTypeName, typeName); p += strlen(p); if (flags & UPRES_WITH_BETA) { sprintf(p, ",\n %s beta", typeName); p += strlen(p); } if (generic) { sprintf(p, ",\n uint nrRows,\n" " uint nrCols"); } uvars.result = "C"; uvars.ld = "ld"; uvars.startRow = "startRow"; uvars.startCol = "startCol"; uvars.nrRows = "nrRows"; uvars.nrCols = "nrCols"; strcat(p, ")\n"); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); } else { memcpy(&uvars, uvarNames, sizeof(uvars)); } // declare local variables sprintf(tmp, "%cPtr uC;\n", LG); kgenAddStmt(ctx, tmp); if (generic) { kgenAddStmt(ctx, "int i, j;\n" "PPtr res;\n"); } else { /* * temporary pointer to pass correctly over the * destination array since destination rows can be * not aligned on a vector bound */ if (sizes[1 - tra] % wvlen != 0) { sprintf(tmp, "%cPtr tmpC;\n", LG); kgenAddStmt(ctx, tmp); } if (wvlen > uplen) { sprintf(tmp, "%s tmp;\n", vecType); kgenAddStmt(ctx, tmp); } } if (isComplexType(dtype) && !(flags & UPRES_WITHOUT_ALPHA)) { declareComplexMultParts(ctx, "alpha", typeName); if (flags & UPRES_WITH_BETA) { declareComplexMultParts(ctx, "beta", typeName); } } kgenAddBlankLine(ctx); if (tra) { sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n", vfield, uvars.result, uvars.startCol, uvars.ld, uvars.startRow); } else { sprintf(tmp, "uC.%s = %s + %s * %s + %s;\n", vfield, uvars.result, uvars.startRow, uvars.ld, uvars.startCol); } kgenAddStmt(ctx, tmp); if ((sizes[1 - tra] % wvlen != 0) && !generic) { kgenAddStmt(ctx, "tmpC = uC;\n"); } ret = kgenAddBlankLine(ctx); if (generic) { updateGenericResultGen(ctx, gset, pitch, &uvars, op, flags, uvarNames ? uvarNames->cachedName : NULL); } else { updateOptimResultGen(ctx, gset, wvlen, (unsigned int)pitch, 0, uvars.ld, op, flags, uvarNames ? uvarNames->cachedName : NULL); } if (!isInlined) { ret = kgenEndFuncBody(ctx); } return (ret) ? -EOVERFLOW : 0; } clblas-2.10/src/library/blas/gens/legacy/blas_kgen_legacy.h000066400000000000000000000137611264277366700237260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef BLAS_KGEN_LEGACY_H_ #define BLAS_KGEN_LEGACY_H_ #include "../blas_kgen.h" /** * @internal * @brief Block multiplier flags * @ingroup BLAS_MAJOR_GENS */ typedef enum BlkmulFlags { BLKMUL_NO_FLAGS, /**< No flags */ BLKMUL_TRANSPOSE = 0x01, /**< Transpose result */ BLKMUL_IMAGE_PACKED = 0x02, /**< Data in image are packed */ /** * Accumulate multiplication results to a * private location provided by caller */ BLKMUL_OUTPUT_PRIVATE = 0x04, BLKMUL_SKEW_ROW = 0x08, /**< Use skew over block rows */ BLKMUL_SKEW_COLUMN = 0x10, /**< Use skew over block columns */ BLKMUL_INLINE = 0x20, /**< Generate an inline version */ BLKMUL_TRANSPOSED_B = 0x40, /**< Block B is transposed */ /** Don't use "&" operation in cyclic address evaluation, use always "%" */ BLKMUL_AVOID_AND = 0x80 } BlkMulFlags; /** * @internal * @brief Block multiplier core * @ingroup BLAS_MAJOR_GENS */ typedef enum BlkmulCore { /** Use separate multiplication and summation implemented by hand */ BLKMUL_SEPARATE_MULADD, /** Use the 'dot' function */ BLKMUL_DOT, /** Use the 'mad' function */ BLKMUL_MAD } BlkmulCore; /** * @internal * @brief Argument names for the inline version of the block * multiplier * @ingroup BLAS_MAJOR_GENS */ typedef struct BlkmulArgNames { const char *coordA; /**< Matrix A start coordinates */ const char *coordB; /**< Matrix B start coordinates */ const char *skewRow; /**< Skew over rows */ const char *skewCol; /**< Skew over columns */ const char *k; /**< Counter name in the loop over K */ const char *vectBoundK; /**< Bound in the loop over K */ } BlkmulArgNames; /** * @internal * @brief Options for matrix block multiplication * generator * @ingroup BLAS_MAJOR_GENS */ typedef struct BlkMulOpts { /** OpenCL memory object type storing matrix (whole or its blocks) A */ CLMemType aMobj; /** OpenCL memory object type storing matrix (whole or its blocks) A */ CLMemType bMobj; BlkMulFlags flags; /**< Specific flags */ BlkmulCore core; /**< Multiply and add core */ /** List of argument names for the inline version */ BlkmulArgNames argNames; } BlkMulOpts; void declareBlasEnums(struct KgenContext *ctx); /** * @internal * @brief Matrix block multiplication generator * * @param[out] ctx Generator context * @param[in] subdims Subproblem dimensions; the first level reflects * dimensions of the large blocks processed with the * whole work group, and the second level * reflects sizes of immediately multiplied small * blocks within the single work item * @param[in] dtype Data type the multiplying function will be * generated for * @param[in] opts Block multiplication options * * Generated functions have the following definitions: \n *\n * For the buffer based version: * @code * void * funcName( * alpha, * LPtr A, * LPtr B, * LPtr C, * [,int2 skewRow] * [,int skewCol]); * @endcode * * Function naming rule: * (type prefix)gemmBlock[Transp]__ *\n * It's assumed A, B and C point to start of data to be * processed during 
this step. *\n * For the image based version: \n * @code * void * funcName( * alpha, * __read_only image2d_t A, * int2 coordA, * __read_only image2d_t B, * int2 coordB, * LPtr C, * [,int2 skewRow], * [,int skewCol]); * @endcode * * Where coordA and coordB mean start image coordinates to fetch data from. *\n * For the image based version a mixed variant is possible when * either A or B blocks are passed through the local memory. *\n * The 'skewRow' and 'skewCol' are optional arguments if the * 'BLKMUL_SKEW_ROW' and "BLKMUL_SKEW_COLUMN" flag is specified * respectively. 'y' field of the row skew is for the block A, and the * 'x' one is for the block B. *\n * Output result can be put directly into a private location provided by the * caller instead of the local one. It is achieved with 'BLKMUL_OUTPUT_PRIVATE' * flag using. *\n * Pointer to this location should have the following types depending on the type * of processed data: \n * - float4 - for float * - float2 - for complex float * - double2 - for double and complex double *\n\n * Alpha is not taken in this case. *\n * The multiplier can be generated as well in the form of the dedicated * function as in the inline form inserted to a kernel. \n In case of inline * version the block multiplier becomes in fact the tile multiplier. In this * case the caller should provide iteration over K. * * @return 0 on success, -EOVERFLOW on source buffer overflowing */ /** * @internal * @defgroup BLAS_MAJOR_GENS BLAS specific generators * @ingroup MAJOR_GENS */ /*@{*/ int blkMulGen( struct KgenContext *ctx, const SubproblemDim subdims[2], DataType dtype, const BlkMulOpts *opts); int updateResultGenOld( struct KgenContext *ctx, const BlasGenSettings *gset, UpdateResultOp op, UpdateResultFlags flags, const UpresVarNames *uvarNames); /*@}*/ #endif /* BLAS_KGEN_LEGACY_H_ */ clblas-2.10/src/library/blas/gens/legacy/blkmul.c000066400000000000000000000555341264277366700217420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * TODO: throw away this generator and replace it with tileMulGen() in all * kernel generators */ #include #include #include #include #include #include #include #include #include #include "../blas_kgen.h" #include "blas_kgen_legacy.h" #define MAX_LENGTH 4096 #define BITS_INT (sizeof(int) * 8) typedef enum VectMulType { VECT_MULT_REAL, VECT_MULT_COMPLEX_REAL, VECT_MULT_IMAG_FLOAT, VECT_MULT_IMAG_DOUBLE } VectMulType; static __inline bool isPower2(size_t a) { return (a && ((a & (a - 1)) == 0)); } /* * get vector chunk size to copy * taking into account its alignment */ static unsigned int vecChunkSize(size_t offset, size_t vecLen) { size_t chunk; for (chunk = vecLen; (chunk > 1) && (offset % chunk); chunk /= 2) { } return (unsigned int)chunk; } static void getCyclicAddrData( BlkMulFlags flags, const char **op, size_t *value, size_t bound) { if (isPower2(bound) && !(flags & BLKMUL_AVOID_AND)) { *op = "&"; *value = bound - 1; } else { *op = "%"; *value = bound; } } static void sprintfInputOffset( char *buf, MatrixRole mrole, int row, int col, size_t vecPitch, size_t bheight, const BlkMulOpts *opts, BlkmulArgNames *argNames, bool singleStepK) { const char *vfield; const char *coordName; const char *op; size_t bound; char colOff[64], rowOff[64]; CLMemType mtype; BlkMulFlags flags = opts->flags; vfield = (mrole == MATRIX_A) ? "y" : "x"; mtype = (mrole == MATRIX_A) ? opts->aMobj : opts->bMobj; if ((mrole == MATRIX_B) && (flags & BLKMUL_TRANSPOSED_B)) { flags &= ~BLKMUL_SKEW_ROW; } if (flags & BLKMUL_SKEW_ROW) { getCyclicAddrData(flags, &op, &bound, bheight); sprintf(rowOff, "((%s.%s + %d) %s %lu)", argNames->skewRow, vfield, row, op, bound); } else { sprintf(rowOff, "%d", row); } if (flags & BLKMUL_SKEW_COLUMN) { getCyclicAddrData(flags, &op, &bound, vecPitch); if (flags & BLKMUL_INLINE) { if (singleStepK) { sprintf(colOff, "%d", col); } else { sprintf(colOff, "(%s + %s + %d) %% %s", argNames->skewCol, argNames->k, col, argNames->vectBoundK); } } else { if (singleStepK) { sprintf(colOff, "%s", argNames->skewCol); } else { sprintf(colOff, "((skewCol + k + %d) %s %lu)", col, op, bound); } } } else { sprintf(colOff, "%d", col); } if (mtype == CLMEM_IMAGE) { coordName = (mrole == MATRIX_A) ? 
argNames->coordA : argNames->coordB; if (flags & BLKMUL_IMAGE_PACKED) { sprintf(buf, "(int2)(%s.x + mad24(%s, %lu, %s), %s.y)", coordName, rowOff, vecPitch, colOff, coordName); } else { sprintf(buf, "(int2)(%s.x + %s, %s.y + %s)", coordName, colOff, coordName, rowOff); } } else { if (flags & BLKMUL_SKEW_ROW) { sprintf(buf, "mad24(%s, %lu, %s)", rowOff, vecPitch, colOff); } else { sprintf(buf, "%lu + %s", row * vecPitch, colOff); } } } static void genRealDot( struct KgenContext *ctx, size_t m, size_t n, size_t nrCols, size_t lenK, unsigned int vecLen) { size_t k; char tmp[MAX_LENGTH], prefix[MAX_LENGTH]; const char *vect = "xyzw"; size_t regPitch = nrCols; size_t off; if (regPitch % vecLen) { regPitch += vecLen - regPitch % vecLen; } off = m * regPitch + n; sprintf(prefix, "c[%lu].%c += ", off / vecLen, vect[off % vecLen]); for (k = 0; k < lenK / vecLen; k++) { off = n * lenK / vecLen + k; sprintf(tmp, "%sdot(a[%lu], b[%lu]);\n", prefix, k, off); kgenAddStmt(ctx, tmp); } } /* * sprintf vector multiplication expression */ static void genVecMul( struct KgenContext *ctx, size_t currCol, size_t lenK, VectMulType type) { size_t k; char tmp[MAX_LENGTH]; const char *suff[] = {"", "", ".yxwz", ".yx"}; sprintf(tmp, "sum = a[%d] * b[%lu]%s", 0, currCol * lenK, suff[type]); for (k = 1; k < lenK; k++) { sprintf(tmp, "%s + a[%lu] * b[%lu]%s", tmp, k, currCol * lenK + k, suff[type]); } strcat(tmp, ";\n"); kgenAddStmt(ctx, tmp); } /* * sprintf vector multiplication expression using mad()'s */ static void genMadMul( struct KgenContext *ctx, size_t currCol, size_t lenK, VectMulType type) { size_t k; char tmp[MAX_LENGTH]; const char *suff[] = {"", "", ".yxwz", ".yx"}; sprintf(tmp, "sum = a[%d] * b[%lu]%s;\n", 0, currCol * lenK, suff[type]); for (k = 1; k < lenK; k++) { sprintf(tmp, "%ssum = mad(a[%lu], b[%lu]%s, sum);\n", tmp, k, currCol * lenK + k, suff[type]); } kgenAddStmt(ctx, tmp); } /* * sprint expression for all the vector components * accumulation */ static void genVecSum( struct KgenContext *ctx, DataType dataType, size_t currRow, size_t currCol, size_t nrCols, unsigned int vecLen, VectMulType mulType) { const char *vect = "xyzw"; unsigned long vecOff, regOff; char c; unsigned int k; size_t pitch = nrCols; char tmp1[MAX_LENGTH], tmp2[MAX_LENGTH]; unsigned int sumLen; // get offset taking into account alignment if ((pitch % vecLen) && !isComplexType(dataType)) { pitch += vecLen - pitch % vecLen; } regOff = (unsigned int)(currRow * pitch + currCol); if (isComplexType(dataType)) { vecOff = (mulType == VECT_MULT_COMPLEX_REAL) ? 0 : 1; sumLen = vecLen * 2; } else { vecOff = regOff % vecLen; regOff /= vecLen; sumLen = vecLen; } sprintf(tmp1, " sum.x"); for (k = 1; k < sumLen; k++) { c = ((mulType == VECT_MULT_COMPLEX_REAL) && (k & 1)) ? 
'-' : '+'; sprintf(tmp1, "%s %c sum.%c", tmp1, c, vect[k]); } sprintf(tmp2, "c[%lu].%c += %s;\n", regOff, vect[vecOff], tmp1); kgenAddStmt(ctx, tmp2); } /* * vector multiplication expression using mad() operations */ static void genMad( struct KgenContext *ctx, DataType dataType, size_t currRow, size_t currCol, size_t nrCols, size_t lenK, unsigned int vecLen, bool vectorized) { const char *vect = {"xyzw"}; unsigned long vecOff, regOff; unsigned int k; size_t pitch = nrCols; char tmp[MAX_LENGTH]; unsigned int sumLen; int bIndex; // get offset taking into account alignment if ((pitch % vecLen) && !isComplexType(dataType)) { pitch += vecLen - pitch % vecLen; } regOff = (unsigned int)(currRow * pitch + currCol); vecOff = (unsigned int)(regOff % vecLen); if (isComplexType(dataType)) { sumLen = vecLen * 2; for (k = 0; k < lenK; k++) { int aIndex = k; bIndex = (int)(currCol * lenK + k); sprintf(tmp, "c[%lu] = mad(a[%d].xy, (float2)(b[%d].x), c[%lu]);\n", regOff, aIndex, bIndex, regOff); kgenAddStmt(ctx, tmp); sprintf(tmp, "c[%lu] = mad(a[%d].yx, (float2)(-b[%d].y, b[%d].y), c[%lu]);\n", regOff, aIndex, bIndex, bIndex, regOff); kgenAddStmt(ctx, tmp); sprintf(tmp, "c[%lu] = mad(a[%d].zw, (float2)(b[%d].z), c[%lu]);\n", regOff, aIndex, bIndex, regOff); kgenAddStmt(ctx, tmp); sprintf(tmp, "c[%lu] = mad(a[%d].wz, (float2)(-b[%d].w, b[%d].w), c[%lu]);\n", regOff, aIndex, bIndex, bIndex, regOff); kgenAddStmt(ctx, tmp); } } else { // Real case if (vectorized) { const char *tname = (isDoubleBasedType(dataType)) ? "double" : "float"; regOff = (unsigned int)(currRow * nrCols / vecLen + currCol); for (k = 0; k < lenK * vecLen; k++) { bIndex = (int)(currCol * lenK * vecLen + k); sprintf(tmp, "c[%lu] = mad((%s%u)a[%u].%c, b[%d], c[%lu]);\n", regOff, tname, vecLen, k / vecLen, vect[k % vecLen], bIndex, regOff); kgenAddStmt(ctx, tmp); } } else { int dimNum; regOff /= vecLen; sumLen = vecLen; if (isDoubleBasedType(dataType)) { dimNum = 2; } else { dimNum = 4; } for (k = 0; k < sumLen*lenK; k++) { sprintf(tmp, "c[%lu].%c = mad(a[%u].%c, b[%lu].%c, " "c[%lu].%c);\n", regOff, vect[vecOff], k / sumLen, vect[k % dimNum], currCol * lenK + (k / sumLen), vect[k % dimNum], regOff, vect[vecOff]); kgenAddStmt(ctx, tmp); } kgenAddBlankLine(ctx); } } } static void getUpdateSkewCoords( struct KgenContext *ctx, const BlkMulOpts *opts, size_t subK, size_t pitchA, size_t pitchB, unsigned int vecLen, const char *ptrNameIn) { char tmp[1024]; bool trb = ((opts->flags & BLKMUL_TRANSPOSED_B) != 0); if (!(opts->flags & BLKMUL_SKEW_COLUMN)) { kgenAddBlankLine(ctx); if (opts->aMobj == CLMEM_IMAGE) { sprintf(tmp, "coordA.x += %lu;\n", subK / vecLen); } else { sprintf(tmp, "A.%s += %lu;\n", ptrNameIn, subK / vecLen); } kgenAddStmt(ctx, tmp); if (!trb) { subK /= vecLen; } if (opts->bMobj == CLMEM_IMAGE) { const char *vfield = (trb) ? "y" : "x"; sprintf(tmp, "coordB.%s += %lu;\n", vfield, subK); } else { size_t u = (trb) ? 
(subK * pitchB / vecLen) : subK; sprintf(tmp, "B.%s += %lu;\n", ptrNameIn, u); } kgenAddStmt(ctx, tmp); } else if (subK == vecLen) { if (isPower2(pitchA / vecLen)) { sprintf(tmp, "\nskewCol = (skewCol + 1) & %lu;\n", pitchA / vecLen - 1); } else { sprintf(tmp, "\nskewCol = (skewCol + 1) %% %lu;\n", pitchA / vecLen); } kgenAddStmt(ctx, tmp); } } // MUST BE LATER DEPRECATED static void genScaleAccResults( struct KgenContext *ctx, DataType dtype, size_t m, size_t n, size_t outPitch, unsigned int vecLen, bool transpose) { char s[MAX_LENGTH]; const char *vect = "xyzw"; char vecChunk[6]; size_t inOff = 0, outOff, vecOff; size_t regPitch = n; size_t i, j, k; bool isDouble; const char *ptrNames[2][4] = { {"f", "f2v", "", "f4v"}, {"d", "d2v", "", ""}}; if ((regPitch % vecLen) && !isComplexType(dtype)) { regPitch += vecLen - regPitch % vecLen; } isDouble = isDoubleBasedType(dtype); for (i = 0; i < m; i++) { j = 0; inOff = i * regPitch; do { /* * get power of 2 size vector element to copy * in the case without transposing and copy * just with single element in the case with * transposing */ if (transpose) { k = 1; outOff = (j * outPitch + i); } else { if (isComplexType(dtype)) { k = 1; } else { k = vecChunkSize(j, vecLen); k = szmin(k, n - j); } outOff = (i * outPitch + j); } if (isComplexType(dtype)) { sprintf(s, "tempC.%s[%lu] += " "c[%lu] * alphaR + c[%lu].yx * alphaI;\n", ptrNames[isDouble][1], outOff, inOff, inOff); } else { if (k == vecLen) { strcpy(vecChunk, ""); } else { vecOff = inOff % vecLen; strcpy(vecChunk, "."); strncat(vecChunk, &vect[vecOff], k); } sprintf(s, "tempC.%s[%lu] += c[%lu]%s * alpha;\n", ptrNames[isDouble][k - 1], outOff / k, inOff / vecLen, vecChunk); } kgenAddStmt(ctx, s); j += k; inOff += k; } while (j < n); } } static void declareBlkMul( struct KgenContext *ctx, DataType dtype, size_t m, size_t n, const BlkMulOpts *opts, BlkmulArgNames *argNames) { char s[MAX_LENGTH]; const char *s1; char c; const char *typeName; bool isPriv = (opts->flags & BLKMUL_OUTPUT_PRIVATE); c = dtypeToBlasPrefix(dtype); typeName = dtypeBuiltinType(dtype); s1 = (opts->flags & BLKMUL_TRANSPOSE) ? "Transp" : ""; // fill argument names argNames->coordA = "coordA"; argNames->coordB = "coordB"; argNames->skewRow = "skewRow"; argNames->skewCol = "skewCol"; sprintf(s, "void\n" "%cgemmBlock%s_%lu_%lu(\n", c, s1, m, n); if (!isPriv) { sprintf(s, "%s %s alpha,\n", s, typeName); } if (opts->aMobj == CLMEM_IMAGE) { sprintf(s, "%s __read_only image2d_t A,\n" " int2 coordA,\n", s); } else { sprintf(s, "%s LPtr A,\n", s); } if (opts->bMobj == CLMEM_IMAGE) { sprintf(s, "%s __read_only image2d_t B,\n" " int2 coordB,\n", s); } else { sprintf(s, "%s LPtr B,\n", s); } if (opts->flags & BLKMUL_OUTPUT_PRIVATE) { if (isDoubleBasedType(dtype)) { typeName = "double2"; } else { typeName = (dtype == TYPE_COMPLEX_FLOAT) ? 
"float2" : "float4"; } sprintf(s, "%s %s *c", s, typeName); } else { sprintf(s, "%s LPtr tempC", s); } if (opts->flags & BLKMUL_SKEW_ROW) { sprintf(s, "%s,\n int2 skewRow", s); } if (opts->flags & BLKMUL_SKEW_COLUMN) { sprintf(s, "%s,\n int skewCol", s); } strcat(s, ")\n"); kgenDeclareFunction(ctx, (const char*)s); } int blkMulGen( struct KgenContext *ctx, const SubproblemDim subdims[2], DataType dtype, const BlkMulOpts *opts) { char s[MAX_LENGTH], s1[MAX_LENGTH]; const char *tNameIn, *tNameOut, *ptrNameIn; size_t vecLen, vlenJ, vlenK; size_t i, j, k; size_t m, n, subK; unsigned int nrRegs; int ret = 0; bool isReal, isDouble; bool isImageA, isImageB; size_t off; size_t pitchA, pitchB, pitchC; unsigned int tsize = dtypeSize(dtype); bool transpose = (opts->flags & BLKMUL_TRANSPOSE); bool trb = ((opts->flags & BLKMUL_TRANSPOSED_B) != 0); bool isPriv = (opts->flags & BLKMUL_OUTPUT_PRIVATE); bool isInlined = (opts->flags & BLKMUL_INLINE); BlkmulCore core = opts->core; BlkmulArgNames argNames; // code to fetch from images for double and float based types const char *imageFetch[2] = { "%c[%lu] = as_float4(read_imageui(%s, sampler, %s));\n", "%c[%lu] = as_double2(read_imageui(%s, sampler, %s));\n"}; if (trb && (opts->flags & BLKMUL_SKEW_COLUMN)) { return -EINVAL; } memcpy(&argNames, &opts->argNames, sizeof(BlkmulArgNames)); strcpy(s, ""); isImageA = (opts->aMobj == CLMEM_IMAGE); isImageB = (opts->bMobj == CLMEM_IMAGE); m = subdims[1].y; n = subdims[1].x; subK = subdims[1].bwidth; tsize = dtypeSize(dtype); // matrix block pitches pitchA = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); k = (trb) ? subdims[0].x : subdims[0].bwidth; pitchB = fl4RowWidth(k, tsize) * sizeof(cl_float4) / tsize; pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft); isReal = !isComplexType(dtype); isDouble = isDoubleBasedType(dtype); vecLen = FLOAT4_VECLEN * sizeof(cl_float) / tsize; if (isDouble) { tNameIn = "double2"; ptrNameIn = "d2v"; } else { tNameIn = "float4"; ptrNameIn = "f4v"; } getResultGPRsInfo(dtype, &subdims[1], (unsigned int)vecLen, &nrRegs, &tNameOut); if (!isInlined) { declareBlkMul(ctx, dtype, m, n, opts, &argNames); kgenBeginFuncBody(ctx); } //variables declaration if (isImageA || isImageB) { kgenAddStmt(ctx, "const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE " "| CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"); } if (!isInlined) { strcpy(s, "uint k;\n"); } sprintf(s, "%s%s a[%lu], b[%lu];\n",s , tNameIn, subK / vecLen, n * subK / vecLen); if (!isPriv) { // declare registers for result sprintf(s, "%s%s c[%u];\n", s, tNameOut, nrRegs); } // 'dot' function can't be used for complex types if (isComplexType(dtype) && (core == BLKMUL_DOT)) { core = BLKMUL_SEPARATE_MULADD; } if ((core == BLKMUL_SEPARATE_MULADD) || isComplexType(dtype)) { sprintf(s,"%s%s sum;\n", s, tNameIn); } kgenAddStmt(ctx, s); if (!isPriv && !isReal) { declareComplexMultParts(ctx, "alpha", tNameOut); } kgenAddBlankLine(ctx); // zeroing temporary multiplication data stored to registers if (!isPriv) { sprintf(s, "for (k = 0; k < %u; k++) {\n" " c[k] = 0;\n" "}\n\n", nrRegs); kgenAddStmt(ctx, s); } //main loop start if (!isInlined) { // initial skew correction if ((opts->flags & BLKMUL_SKEW_COLUMN) && (subK == vecLen)) { if (isPower2(pitchA / vecLen) && !(opts->flags & BLKMUL_AVOID_AND)) { sprintf(s, "skewCol = skewCol & %lu;\n", pitchA / vecLen - 1); } else { sprintf(s, "\nskewCol = skewCol %% %lu;\n", pitchA / vecLen); } kgenAddStmt(ctx, s); } sprintf(s, "\nfor (k = 0; k < %lu; k += %lu)", subdims[0].bwidth / vecLen, subK / 
vecLen); ret = kgenBeginBranch(ctx, s); } if (trb) { vlenJ = vecLen; vlenK = 1; } else { vlenJ = 1; vlenK = vecLen; } for (j = 0; j < n / vlenJ; j++) { // fetch elements of matrix B for (k = 0; k < subK / vlenK; k++) { size_t coords[2] = {k, j}; if (trb) { off = j * subK + k; } else { off = j * subK / vecLen + k; } sprintfInputOffset(s1, MATRIX_B, (int)coords[1 - trb], (int)coords[trb], pitchB / vecLen, subdims[1].x, opts, &argNames, (subK == vecLen)); if (isImageB) { sprintf(s, imageFetch[isDouble], 'b', off, "B", s1); } else { sprintf(s, "b[%lu] = B.%s[%s];\n", off, ptrNameIn, s1); } ret = kgenAddStmt(ctx, s); } } for (i = 0; i < m; i++) { kgenAddBlankLine(ctx); // fetch elements of matrix A from single row for (k = 0; k < subK / vecLen; k++) { sprintfInputOffset(s1, MATRIX_A, (int)i, (int)k, pitchA / vecLen, subdims[1].y, opts, &argNames, (subK == vecLen)); if (isImageA) { sprintf(s, imageFetch[isDouble], 'a', k, "A", s1); } else { sprintf(s,"a[%lu] = A.%s[%s];\n", k, ptrNameIn, s1); } ret = kgenAddStmt(ctx, s); } // multiply matrix A row on matrix B block for (j = 0; j < n / vlenJ; j++) { if (isReal) { //real case switch (core) { case BLKMUL_DOT: genRealDot(ctx, i, j, n, subK, (unsigned int)vecLen); break; case BLKMUL_MAD: genMad(ctx, dtype, i, j, n, subK / vecLen, (unsigned int)vecLen, trb); break; case BLKMUL_SEPARATE_MULADD: genVecMul(ctx, j, subK / vecLen, VECT_MULT_REAL); genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, VECT_MULT_REAL); break; } } else { //complex case VectMulType mulType = (dtype == TYPE_COMPLEX_FLOAT) ? VECT_MULT_IMAG_FLOAT : VECT_MULT_IMAG_DOUBLE; if (core == BLKMUL_MAD) { //real part genMadMul(ctx, j, subK / vecLen, VECT_MULT_COMPLEX_REAL); genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, VECT_MULT_COMPLEX_REAL); //imaginary part genMadMul(ctx, j, subK / vecLen, mulType); genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, mulType); } else { //real part genVecMul(ctx, j, subK / vecLen, VECT_MULT_COMPLEX_REAL); genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, VECT_MULT_COMPLEX_REAL); //imaginary part genVecMul(ctx, j, subK / vecLen, mulType); genVecSum(ctx, dtype, i, j, n, (unsigned int)vecLen, mulType); } } } } // update coordinates/skews and end the loop if (!isInlined) { getUpdateSkewCoords(ctx, opts, subK, pitchA, pitchB, (unsigned int)vecLen, ptrNameIn); kgenEndBranch(ctx, NULL); } if (!isPriv) { kgenAddBlankLine(ctx); genScaleAccResults(ctx, dtype, m, n, pitchC, (unsigned int)vecLen, transpose); } if (!isInlined) { ret = kgenEndFuncBody(ctx); } return ret ? -EOVERFLOW : 0; } clblas-2.10/src/library/blas/gens/legacy/gemm_img.c000066400000000000000000000550231264277366700222260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * gemm image based generators */ #include #include #include #include #include #include #include #include #include #include "blas_kgen_legacy.h" #include "../gen_helper.h" #include "gen_helper_legacy.h" static CLBLASMpatExtra mpatExtra; static const char *prepareImagesGemmDeclA = "void __kernel\n" "%cprepareImageA(\n" " clblasOrder order,\n" " clblasTranspose transA,\n" " uint M,\n" " uint K,\n" " __global %s *A,\n" " uint lda,\n" " __write_only image2d_t imgA,\n" " uint offsetA)\n"; static const char *prepareImagesGemmDeclB = "void __kernel\n" "%cprepareImageB(\n" " clblasOrder order,\n" " clblasTranspose transB,\n" " uint N,\n" " uint K,\n" " __global %s *B,\n" " uint ldb,\n" " __write_only image2d_t imgB,\n" " uint offsetB)\n"; static const char *imgGemmDecl = "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" "void __kernel\n" "%cgemmImg(\n" " const uint M,\n" " const uint N,\n" " const uint K,\n" " const %s alpha,\n" " const __read_only image2d_t A,\n" " const __read_only image2d_t B,\n" " const %s beta,\n" " __global %s *C,\n" " const uint ldc,\n" " const uint offsetC)\n"; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t preparator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t genWrapper( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) { return generator(buf, buflen, subdims, pgran, extra); } else { return preparator(buf, buflen, subdims, pgran, extra); } } static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static int imgGetPerf( unsigned int kflags, const void *args); static SolverOps imgSops = { genWrapper, assignKargs, isFitToLDS, imgGetPerf, NULL, calcNrThreads, NULL, solverFlags, NULL, //fixupKargs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; // Preparation function for images based kernel generator static ssize_t preparator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; char tmp[4096], conjStr[1024]; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; CopyImgFuncs copyImgFuncs; DataType dtype = kextra->dtype; BlasGenSettings gset; unsigned int vecLen; unsigned int tsize; const char *typeName; char fpref; bool b; size_t localBufSize; ssize_t ret; const char *conjCond; const char *functionHeadA = "int tra, aligned;\n" "const uint bpr = (K + %lu) / %lu;\n" "uint m = (gid / bpr) * %lu;\n" "uint k = (gid %% bpr) * %lu;\n" "uint x, y;\n" "__local %s temp[%lu];\n" "\n" "A += offsetA;\n" "tra = (!transA && order == clblasColumnMajor) ||\n" " (transA && order == clblasRowMajor);\n" "if (m >= M) {\n" " return;\n" "}\n"; const char *functionHeadB = "int trb, aligned;\n" "const uint bpr = (K + %lu) / %lu;\n" "const uint n = (gid / bpr) * %lu;\n" "const uint k = (gid %% bpr) * %lu;\n" "uint x, y;\n" "__local %s temp[%lu];\n" "\n" "B += offsetB;\n" "trb = 
(!transB && order == clblasRowMajor) ||\n" " (transB && order == clblasColumnMajor);\n" "if (n >= N) {\n" " return;\n" "}\n"; // Distribute blocks across compute units and copy matrix A to image. // Transposition and filling with zeros in unaligned cases is made using // buffer in local memory. const char *copyToImageA = "//copy matrix A block\n" "y = m + %u <= M ? %u : M - m;\n" "x = k + %u <= K ? %u : K - k;\n" "aligned = (x == %u) && (y == %u) && %d;\n" "int atcase = aligned * 10 + tra;\n" "%s" // conjugated check "if (atcase != 10) {\n" " %s((__local float4*)temp);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "switch(atcase) {\n" "case 10: //aligned, not transposed\n" " %s(imgA, k / %u, m, (GPtr)A, m, k, lda);\n" " break;\n" "%s" // conjugated case "case 1: //not aligned, transposed\n" " // generic transposed global to local\n" " %s((LPtr)temp, (GPtr)A, k, m, x, y, %u, lda);\n" " break;\n" "case 0: //not aligned, not transposed\n" " // generic global to local\n" " %s((LPtr) temp, (GPtr)A, m, k, y, x, %u, lda);\n" " break;\n" "case 11: //aligned, transposed\n" " // optimized transposed global to local\n" " %s((LPtr) temp, (GPtr)A, k, m, lda);\n" " break;\n" "}\n" "if (atcase != 10) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s(imgA, k / %u, m, (LPtr) temp);\n" "}\n" "\n"; const char *copyToImageB = "//copy matrix B block\n" "y = n + %u <= N ? %u : N - n;\n" "x = k + %u <= K ? %u : K - k;\n" "aligned = (x == %u) && (y == %u) && %d;\n" "int atcase = aligned * 10 + trb;\n" "%s" // conjugated check "if (atcase != 10) {\n" " %s((__local float4*)temp);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "}\n" "switch (atcase) {\n" "case 10: //aligned, not transposed\n" " %s(imgB, k / %u, n, (GPtr)B, n, k, ldb);\n" " break;\n" "%s" // conjugated case "case 1: //not aligned, transposed\n" " // generic transposed global to local\n" " %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n" " break;\n" "case 0: //not aligned, not transposed\n" " // generic global to local\n" " %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n" " break;\n" "case 11: //transposed, aligned\n" " // optimized transposed global to local\n" " %s((LPtr)temp, (GPtr)B, k, n, ldb);\n" " break;\n" "}\n" "if (atcase != 10) {\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s(imgB, k / %u, n, (LPtr)temp);\n" "}\n" "\n"; memset(©ImgFuncs, 0, sizeof(copyImgFuncs)); memset(&gset, 0, sizeof(gset)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } tsize = dtypeSize(dtype); b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); declareBlasEnums(ctx); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; // generate necessary memory to image copying functions generateImageCopyFuncs(©ImgFuncs, ctx, CLBLAS_GEMM, &gset); kgenAddBlankLine(ctx); vecLen = sizeof(cl_float4) / dtypeSize(dtype); typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); if (kextra->kernType == CLBLAS_PREP_A_KERNEL) { sprintf(tmp, prepareImagesGemmDeclA, fpref, typeName, typeName); kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); // same local buffer is used for both matrix A and matrix B blocks localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize); localBufSize *= vecLen; kgenDeclareGroupID(ctx, "gid", pgran); sprintf(tmp, functionHeadA, subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].y, subdims[1].bwidth, typeName, localBufSize); kgenAddStmt(ctx, tmp); if (isComplexType(dtype)) { conjCond = "atcase += ((atcase == 10) && " "(transA == clblasConjTrans)) ? 
100 : 0;\n"; sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n" " %s((LPtr)temp, (GPtr)A, m, k, lda);\n" " break;\n", copyImgFuncs.globalToLocal[MATRIX_A]); } else { conjCond = ""; strcpy(conjStr, ""); } sprintf(tmp, copyToImageA, subdims[1].y, subdims[1].y, // y = m + dy <= M ?... subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?... subdims[1].bwidth, subdims[1].y, // aligned = (x==bw1)&&(y==dy1) (kextra->flags & KEXTRA_NO_COPY_VEC_A) == 0, conjCond, copyImgFuncs.zeroBlock[MATRIX_A], copyImgFuncs.globalToImage[MATRIX_A], vecLen, conjStr, copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_A], subdims[1].bwidth, copyImgFuncs.globalToLocalGeneric[MATRIX_A], subdims[1].bwidth, copyImgFuncs.globalToLocalTransposed[MATRIX_A], copyImgFuncs.localToImage[MATRIX_A], vecLen); kgenAddStmt(ctx, tmp); } else { // PREP_B sprintf(tmp, prepareImagesGemmDeclB, fpref, typeName, typeName); kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); // same local buffer is used for both matrix A and matrix B blocks localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize); localBufSize *= vecLen; kgenDeclareGroupID(ctx, "gid", pgran); sprintf(tmp, functionHeadB, subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].x, subdims[1].bwidth, typeName, localBufSize); kgenAddStmt(ctx, tmp); if (isComplexType(dtype)) { conjCond = "atcase += ((atcase == 10) && " "(transB == clblasConjTrans)) ? 100 : 0;\n"; sprintf(conjStr, "case 110: //conjugated, not transposed, aligned\n" " %s((LPtr)temp, (GPtr)B, n, k, ldb);\n" " break;\n", copyImgFuncs.globalToLocal[MATRIX_B]); } else { conjCond = ""; strcpy(conjStr, ""); } sprintf(tmp, copyToImageB, subdims[1].x, subdims[1].x, // y = n + dy <= N ?... subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= K ?... subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1) (kextra->flags & KEXTRA_NO_COPY_VEC_B) == 0, conjCond, copyImgFuncs.zeroBlock[MATRIX_B], copyImgFuncs.globalToImage[MATRIX_B], vecLen, conjStr, copyImgFuncs.globalToLocalTransposedGeneric[MATRIX_B], subdims[1].bwidth, copyImgFuncs.globalToLocalGeneric[MATRIX_B], subdims[1].bwidth, copyImgFuncs.globalToLocalTransposed[MATRIX_B], copyImgFuncs.localToImage[MATRIX_B], vecLen); kgenAddStmt(ctx, tmp); } kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "imgA"; kvars->B = "imgB"; if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "K"; } // global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; char tmp[4096], tmp1[4096]; char *p; // is the iteration over N, N at the top level const char *typeName; char fpref; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; unsigned int tsize; unsigned int vecLen, outVecLen; bool b; const char *outTypeName; unsigned int i; unsigned int nrRegs, regPitch; int tra, trb; char vect[2] = {'y', 'x'}; const char *coordConstants = "const uint workItemM = get_global_id(0) * %lu;\n" "const uint workItemN = get_global_id(1) * %lu;\n" "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n" "uint vectK = (K + %u) / %u;\n"; /* * template for image based gemm preparation part * for two dimensional work space */ const char *localVariables = "uint k0;\n" "int2 coordA = (int2)(0, workItemM);\n" "int2 coordB = (int2)(0, workItemN);\n" "%s c[%u];\n\n"; tsize = dtypeSize(dtype); vecLen = sizeof(cl_float4) / dtypeSize(dtype); if (isComplexType(dtype)) { regPitch = (unsigned int)subdims[1].x; } else { regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) * sizeof(cl_float4) / tsize; } memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kextra->flags); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); // now, generate the kernel sprintf(tmp, imgGemmDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, typeName, typeName, typeName); kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); // constants sprintf(tmp, coordConstants, subdims[1].y, subdims[1].x, subdims[1].y, vecLen - 1, vecLen); kgenAddStmt(ctx, tmp); /* * Calculate local buffer pitches, and then declare local * variables */ getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); sprintf(tmp, localVariables, outTypeName, nrRegs); kgenAddStmt(ctx, tmp); // check if offset exceeds matrix kgenAddStmt(ctx, "if ((workItemM >= M) ||" "(workItemN >= N)) {\n" " return;\n" "}\n"); kgenAddStmt(ctx, "C += offsetC;\n"); // zero C block sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" " c[k0] = 0;\n" "}\n\n", nrRegs); kgenAddStmt(ctx, tmp); // block multiplication inlined function sprintf(tmp, "for (k0 = 0; k0 < vectK; k0 += %lu)", subdims[1].bwidth / vecLen); kgenBeginBranch(ctx, tmp); mulOpts.aMobj = CLMEM_IMAGE; mulOpts.bMobj = CLMEM_IMAGE; mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE; if (isComplexType(dtype)) { mulOpts.core = BLKMUL_SEPARATE_MULADD; } else { mulOpts.core = BLKMUL_MAD; } mulOpts.argNames.coordA = "coordA"; mulOpts.argNames.coordB = "coordB"; mulOpts.argNames.skewCol = "skewCol"; 
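/* Names the inlined block multiplier will emit in the generated source for
   tile coordinates, skews and the K-loop counter/bound; they correspond to
   the BlkmulArgNames fields declared in blas_kgen_legacy.h. */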
mulOpts.argNames.skewRow = "skewRow"; mulOpts.argNames.k = "k0"; mulOpts.argNames.vectBoundK = "vectK"; ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } // update image coordinates sprintf(tmp, "\ncoordA.x += %lu;\n" "coordB.x += %lu;\n", subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); // reorder the given solution outVecLen = isComplexType(dtype) ? 1 : vecLen; p = tmp1; for (i = 0; i < regPitch / outVecLen; i++) { unsigned int k = (unsigned int)(subdims[1].y - 1) * regPitch / outVecLen + i; sprintf(p, "\n" " tmp = c[%u];\n" " for (j = %lu; j >= 0; j--) {\n" " c[(j+1) * %u + %u] = c[j * %u + %u];\n" " }\n" " c[%u] = tmp;\n", k, subdims[1].y - 2, regPitch / outVecLen, i, regPitch / outVecLen, i, i); p += strlen(p); } sprintf(tmp, "\n" "for (k0 = 0; k0 < skewRow.y; k0++) {\n" " int j;\n" " %s tmp;\n" "%s" "}\n" "\n", outTypeName, tmp1); kgenAddStmt(ctx, tmp); tra = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_GEMM, kextra->flags, MATRIX_B); sprintf(tmp, "coordA.%c = workItemM;\n" "coordB.%c = workItemN;\n\n", vect[tra], vect[trb]); kgenAddStmt(ctx, tmp); // write back the tile evaluated generateResultUpdateOld(ctx, CLBLAS_GEMM, &gset, NULL, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; (void)extra; switch (blasArgs->kernType) { case CLBLAS_COMPUTING_KERNEL: // arguments for computational kernel initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); initSizeKarg(&args[2], blasArgs->K); assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); INIT_KARG(&args[4], blasArgs->scimage[0]); INIT_KARG(&args[5], blasArgs->scimage[1]); assignScalarKarg(&args[6], &(blasArgs->beta), blasArgs->dtype); initMemobjKarg(&args[7], blasArgs->C, NULL, 0, 0); initSizeKarg(&args[8], blasArgs->ldc.matrix); initSizeKarg(&args[9], blasArgs->offCY); break; case CLBLAS_PREP_A_KERNEL: INIT_KARG(&args[0], blasArgs->order); INIT_KARG(&args[1], blasArgs->transA); initSizeKarg(&args[2], blasArgs->M); initSizeKarg(&args[3], blasArgs->K); initMemobjKarg(&args[4], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[5], blasArgs->lda.matrix); INIT_KARG(&args[6], blasArgs->scimage[0]); initSizeKarg(&args[7], blasArgs->offA); break; case CLBLAS_PREP_B_KERNEL: INIT_KARG(&args[0], blasArgs->order); INIT_KARG(&args[1], blasArgs->transB); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->K); initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[5], blasArgs->ldb.matrix); INIT_KARG(&args[6], blasArgs->scimage[1]); initSizeKarg(&args[7], blasArgs->offBX); break; default: //this should not happen break; } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong size; const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side); return (size * dtypeSize(dtype) <= ldsSize); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { const CLBlasKargs *kargs = args; (void)extra; if (kargs->kernType != CLBLAS_COMPUTING_KERNEL) { const size_t *whole, *part; size_t 
nrGroups; // each thread gets one block if (kargs->kernType == CLBLAS_PREP_A_KERNEL) { whole = &kargs->M; part = &subdims[0].itemY; } else { whole = &kargs->N; part = &subdims[0].itemX; } nrGroups = *whole / *part + (*whole % *part != 0); nrGroups *= (kargs->K / subdims[0].bwidth + (kargs->K % subdims[0].bwidth != 0)); threads[0] = pgran->wgSize[0] * nrGroups; threads[1] = pgran->wgSize[1]; } else { calcGlobalThreads(threads, &subdims[0], pgran, kargs->M, kargs->N); } } static SolverFlags solverFlags(void) { return (SF_WSPACE_2D); } void initGemmImgPattern(MemoryPattern *mempat) { mempat->name = "Image based block gemm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &imgSops; mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_IMAGE; mpatExtra.mobjB = CLMEM_IMAGE; mempat->extra = &mpatExtra; } static int imgGetPerf( unsigned int kflags, const void *args) { (void)args; (void)kflags; return PPERF_POOR; } clblas-2.10/src/library/blas/gens/legacy/gemm_lds.c000066400000000000000000000406511264277366700222350ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * LDS based block GEMM generator */ #include #include #include #include #include #include #include #include #include #include "../init.h" #include "blas_kgen_legacy.h" #include "gen_helper_legacy.h" #include "../gen_helper.h" static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static int ldsGetPerf( unsigned int kflags, const void *args); static SolverOps solverOps = { generator, assignKargs, isFitToLDS, ldsGetPerf, NULL, NULL, NULL, solverFlags, NULL, //fixupKargs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; static void declareKernel( struct KgenContext *ctx, DataType dtype, const PGranularity *pgran) { char tmp[1024]; char fpref; const char *typeName; typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n" "void __kernel\n" "%cgemm(\n" " uint M,\n" " uint N,\n" " uint K,\n" " %s alpha,\n" " __global %s *A,\n" " uint lda,\n" " __global %s *B,\n" " uint ldb,\n" " %s beta,\n" " __global %s *C,\n" " uint ldc,\n" " const uint offA,\n" " const uint offB,\n" " const uint offC)\n", pgran->wgSize[0], pgran->wgSize[1], fpref, typeName, typeName, typeName, typeName, typeName); kgenDeclareFunction(ctx, tmp); } static void declareLocalVariables( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims, const PGranularity *pgran) { char tmp[1024]; const char *inTypeName, *outTypeName; size_t pitchAB; unsigned int nrRegs; unsigned int vecLen; inTypeName = dtypeBuiltinType(dtype); pitchAB = matrBlockPitch(dims, MATRIX_A, dtype, clblasLeft); vecLen = sizeof(cl_float4) / dtypeSize(dtype); getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, &outTypeName); sprintf(tmp, "uint m0, k0;\n" "__local %s tempA[%lu];\n" "__local %s tempB[%lu];\n" "%s c[%u];\n" "uint currM, currN, groupsPan;\n" "uint2 coordA, coordB;\n" "uint x, y;\n", inTypeName, pitchAB * dims[0].y, inTypeName, pitchAB * dims[0].x, outTypeName, nrRegs); kgenAddStmt(ctx, tmp); kgenDeclareLocalID(ctx, "lid", pgran); kgenDeclareGroupID(ctx, "gid", pgran); kgenAddBlankLine(ctx); } static void genPrepareBlockA( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags) { char tmp[1024]; size_t pitch; const char *coordName[2] = {"currM", "k0"}; const char *sizeName[2] = {"y", "x"}; size_t bsize[2] = {dim->y, dim->bwidth}; int tra; tra = isMatrixAccessColMaj(CLBLAS_GEMM, flags, MATRIX_A); pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); /* * If the (sub)problem is integrally divisible, * skip any checks, and just read with optimal blocks, * otherwise check for tails and then read with a * fast function in the case of optimal blocks, and with * the slow one in the case of tails respectively */ if (!(flags & (KEXTRA_TAILS_M | KEXTRA_TAILS_K))) { sprintf(tmp, "%s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n", copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra]); } else { sprintf(tmp, "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= K) ? 
%lu : K - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // fast read " %s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n" "}\n" "else {\n" " %s((__local float4*)tempA);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // slow read " %s((LPtr)tempA, (GPtr)A, %s, %s, %s, %s, %lu, lda);\n" "}\n\n", bsize[0], bsize[0], bsize[1], bsize[1], bsize[0], bsize[1], copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra], zeroFuncs->names[MATRIX_A], copyFuncs->readGeneric[MATRIX_A], coordName[tra], coordName[1 - tra], sizeName[tra], sizeName[1 - tra], pitch); } kgenAddStmt(ctx, tmp); } static void genPrepareBlockB( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags) { char tmp[1024]; size_t pitch; const char *coordName[2] = {"currN", "k0"}; const char *sizeName[2] = {"y", "x"}; size_t bsize[2] = {dim->x, dim->bwidth}; int trb; pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft); trb = isMatrixAccessColMaj(CLBLAS_GEMM, flags, MATRIX_B); if (!(flags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K))) { sprintf(tmp, "%s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n", copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb]); } else { sprintf(tmp, "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= K) ? %lu : K - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // fast read " %s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n" "}\n" "else {\n" " %s((__local float4*)tempB);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // slow read " %s((LPtr)tempB, (GPtr)B, %s, %s, %s, %s, %lu, ldb);\n" "}\n\n", bsize[0], bsize[0], bsize[1], bsize[1], bsize[0], bsize[1], copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb], zeroFuncs->names[MATRIX_B], copyFuncs->readGeneric[MATRIX_B], coordName[trb], coordName[1 - trb], sizeName[trb], sizeName[1 - trb], pitch); } kgenAddStmt(ctx, tmp); } static void genZeroResult( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims) { unsigned int n; char tmp[1024]; unsigned int vecLen; vecLen = sizeof(cl_float4) / dtypeSize(dtype); getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL); sprintf(tmp, "\n" "for (x = 0; x < %u; x++) {\n" " c[x] = 0;\n" "}\n\n", n); kgenAddStmt(ctx, tmp); } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "A"; kvars->B = "B"; if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "K"; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; char tmp[1024]; char blkmul[128]; char updateResFn[FUNC_NAME_MAXLEN]; char updateResGenericFn[FUNC_NAME_MAXLEN]; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; size_t pitchAB; const char *s; bool b; int tra, trb; unsigned int l1Pans; unsigned int vecLen; char vect[2] = {'y', 'x'}; UpdateResultFlags upFlags; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); b = isDoubleBasedType(dtype); 
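/*
 * Illustrative sketch of the OpenCL source this generator emits for the
 * single-precision case. This is not verbatim generator output: the work
 * group size, local array lengths, register count and block width below
 * are placeholders standing in for the values derived from subdims and
 * pgran at generation time.
 *
 *   __attribute__((reqd_work_group_size(WG0, WG1, 1)))
 *   void __kernel
 *   sgemm(uint M, uint N, uint K, float alpha,
 *         __global float *A, uint lda, __global float *B, uint ldb,
 *         float beta, __global float *C, uint ldc,
 *         const uint offA, const uint offB, const uint offC)
 *   {
 *       __local float tempA[TA_LEN], tempB[TB_LEN];
 *       float4 c[NR_REGS];                  // private accumulator tile
 *
 *       A += offA; B += offB; C += offC;
 *       // map the group id onto an output square (currM, currN), using
 *       // cyclic addressing of the inner panels to spread accesses over
 *       // memory banks, then zero c[]
 *       for (k0 = 0; k0 < K; k0 += BW0) {
 *           // copy the current A and B blocks to LDS, zero-padding tails
 *           barrier(CLK_LOCAL_MEM_FENCE);
 *           // LDS block multiplication accumulating into c[]
 *           barrier(CLK_LOCAL_MEM_FENCE);
 *       }
 *       // adjust per-item coordinates and store C = alpha*acc + beta*C
 *   }
 */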
kgenDeclareUptrs(ctx, b); // generator settings initialization memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kflags); generateBufCopyFuncs(©Funcs, ctx, CLBLAS_GEMM, &gset, BCHF_MATRIX_A | BCHF_MATRIX_B); generateUpresFuncs(ctx, CLBLAS_GEMM, &gset, updateResFn, updateResGenericFn); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A | ZF_MATRIX_B); // block multiplication function mulOpts.aMobj = CLMEM_BUFFER; mulOpts.bMobj = CLMEM_BUFFER; mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_COLUMN; if (isComplexType(dtype)) { mulOpts.core = BLKMUL_SEPARATE_MULADD; } else { mulOpts.core = BLKMUL_MAD; } ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } kgenAddBlankLine(ctx); kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx); // now, generate the kernel declareKernel(ctx, dtype, pgran); kgenBeginFuncBody(ctx); declareLocalVariables(ctx, dtype, subdims, pgran); // Shift matrices' origins according to offsetM and offsetN. kgenAddBlankLine(ctx); tmp[0] = '\0'; strcat(tmp, "A += offA;\n"); strcat(tmp, "B += offB;\n"); strcat(tmp, "C += offC;\n"); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); /* * Output matrix is divided into squares, each work group * gets such a square. Get current panel coordinates * depending on which matrix must be outer. * Assign different inner matrix's panels processed * at the same time to different work groups in order to * reduce global memory bank conflicts. Use cyclic * addressing for this purpose */ sprintf(tmp, // number of outer panels "groupsPan = N / %lu;\n" "if (N %% %lu) {\n" " groupsPan++;\n" "}\n" "x = gid %% groupsPan;\n" // outer panel number "y = gid / groupsPan;\n" // outer inner number "currN = x * %lu;\n" "\n" // number of inner panels "groupsPan = M / %lu;\n" "if (M %% %lu) {\n" " groupsPan++;\n" "}\n" // inner panel number using cyclic addressing "y = (x + y) %% groupsPan;\n" "currM = y * %lu;\n" "\n", subdims[0].itemX, subdims[0].itemX, subdims[0].itemX, subdims[0].itemY, subdims[0].itemY, subdims[0].itemY); ret = kgenAddStmt(ctx, tmp); tra = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_GEMM, kflags, MATRIX_B); sprintf(tmp, "coordA.%c = currM;\n" "coordA.%c = 0;\n" "coordB.%c = currN;\n" "coordB.%c = 0;\n\n", vect[tra], vect[1 - tra], vect[trb], vect[1 - trb]); kgenAddStmt(ctx, tmp); genZeroResult(ctx, dtype, subdims); // loop over K sprintf(tmp, "for (k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, tmp); genPrepareBlockA(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags); genPrepareBlockB(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; vecLen = sizeof(cl_float4) / dtypeSize(dtype); // and eventually multiply the blocks and update the current result getResultGPRsInfo(dtype, &subdims[1], vecLen, NULL, &s); sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu),\n" " (LPtr)(tempB + (lid %% %u * %lu) * %lu),\n" " (%s*)c, lid);\n", blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, subdims[1].x, pitchAB, s); kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenEndBranch(ctx, NULL); // loop over K // update result logic sprintf(tmp, "coordA.%c += lid / %u * %lu;\n" "coordB.%c += lid %% %u * %lu;\n", vect[tra], l1Pans, subdims[1].y, vect[trb], l1Pans, subdims[1].x); kgenAddStmt(ctx, 
tmp); if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { sprintf(tmp, "if (coordA.%c >= M || coordB.%c >= N) {\n" " return;\n" "}\n", vect[tra], vect[trb]); kgenAddStmt(ctx, tmp); } kgenAddBlankLine(ctx); upFlags = kextraToUpresFlags(CLBLAS_GEMM, kflags); upFlags |= UPRES_EXCEED_PROBLEM_CONDITION; genResultUpdateWithFlagsOld(ctx, CLBLAS_GEMM, &gset, upFlags, updateResFn, updateResGenericFn, NULL); ret = kgenEndFuncBody(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; (void)extra; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); initSizeKarg(&args[2], blasArgs->K); assignScalarKarg(&args[3], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[4], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[5], blasArgs->lda.matrix); initMemobjKarg(&args[6], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[7], blasArgs->ldb.matrix); assignScalarKarg(&args[8], &(blasArgs->beta), blasArgs->dtype); initMemobjKarg(&args[9], blasArgs->C, NULL, 0, 0); initSizeKarg(&args[10], blasArgs->ldc.matrix); initSizeKarg(&args[11], blasArgs->offA); initSizeKarg(&args[12], blasArgs->offBX); initSizeKarg(&args[13], blasArgs->offCY); } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong size; (void)kernelArgs; size = matrBlockSize(dim, MATRIX_A, dtype, clblasLeft); size += matrBlockSize(dim, MATRIX_B, dtype, clblasLeft); size += matrBlockSize(dim, MATRIX_C, dtype, clblasLeft); return (size * dtypeSize(dtype) <= ldsSize); } static SolverFlags solverFlags(void) { return (SF_WSPACE_2D); } void initGemmLdsPattern(MemoryPattern *mempat) { mempat->name = "LDS based block gemm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &solverOps; mpatExtra.aMset = CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } static int ldsGetPerf( unsigned int kflags, const void *args) { (void)args; (void)kflags; return PPERF_POOR; } clblas-2.10/src/library/blas/gens/legacy/gen_helper_legacy.c000066400000000000000000000340221264277366700240750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "gen_helper_legacy.h" #include "blas_kgen_legacy.h" #include "../gen_helper.h" typedef struct CopyPattern { SubproblemDim dim; const PGranularity *pgran; DataType dtype; DBlockCopyDirection dir; DBlockCopyFlags flags; bool generic; bool zeroing; } CopyPattern; static int cpyImgGenCallback(struct KgenContext *ctx, const void *pattern) { const CopyPattern *pat = (CopyPattern*)pattern; const void *dim = (pat->generic) ? 
NULL : &pat->dim; if(pat->zeroing) { return f4zeroBlockGen(ctx, dim, pat->pgran, "__local"); } else { return copyDataBlockGen(ctx, dim, pat->pgran, pat->dtype, pat->dir, pat->flags); } } int generateImageCopyFuncs( CopyImgFuncs *copyFuncs, struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset) { const SubproblemDim *dims = gset->subdims; KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; const PGranularity *pgran = gset->pgran; CopyPattern pattern; // mandatory flags for global to local copying DBlockCopyFlags glcpFlags[2] = {0, 0}; struct KgenGuard *guard; unsigned int tsize; int ret = 0; bool isTra, areTails, isConjA; bool customize; if (kflags & KEXTRA_NO_COPY_VEC_A) { glcpFlags[0] = DBLOCK_COPY_NOT_VECTORIZE; } if (kflags & KEXTRA_NO_COPY_VEC_B) { glcpFlags[1] = DBLOCK_COPY_NOT_VECTORIZE; } tsize = dtypeSize(dtype); isTra = isMatrixAccessColMaj(funcID, kflags, MATRIX_A); isConjA = isMatrixConj(kflags, MATRIX_A); areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); customize = (funcID == CLBLAS_TRMM); guard = createKgenGuard(ctx, cpyImgGenCallback, sizeof(CopyPattern)); if (guard == NULL) { return -ENOMEM; } memset(&pattern, 0, sizeof(pattern)); pattern.zeroing = false; pattern.dim = dims[0]; pattern.dir = DBLOCK_GLOBAL_TO_IMAGE; pattern.dtype = dtype; pattern.flags = 0; pattern.generic = false; pattern.pgran = pgran; if (!(customize && (isTra || isConjA))) { pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[0].y; findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[0].x; findGenerateFunction(guard, &pattern, copyFuncs->globalToImage[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].y; pattern.dir = DBLOCK_LOCAL_TO_IMAGE; findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].x; pattern.dir = DBLOCK_LOCAL_TO_IMAGE; findGenerateFunction(guard, &pattern, copyFuncs->localToImage[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // Global to local optimized pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; if (customize || isComplexType(dtype)) { pattern.flags = (!customize || isConjA) ? DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[0]; pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].y; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if ((funcID == CLBLAS_GEMM) && isComplexType(dtype)) { pattern.flags = DBLOCK_COPY_CONJUGATE | glcpFlags[1]; pattern.dim.x = dims[0].bwidth; pattern.dim.y = dims[1].x; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocal[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } // Global to local generic pattern.dim = dims[0]; pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; pattern.generic = true; if (!customize || areTails) { pattern.flags = (isConjA) ? DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[0]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalGeneric[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.flags = (kflags & KEXTRA_CONJUGATE_B) ? 
DBLOCK_COPY_CONJUGATE : 0; pattern.flags |= glcpFlags[1]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalGeneric[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // Global to local transposed functions pattern.dir = DBLOCK_GLOBAL_TO_LOCAL; pattern.flags = (kflags & KEXTRA_NO_COPY_VEC_A) ? DBLOCK_COPY_NOT_VECTORIZE : 0; pattern.flags |= glcpFlags[0]; if (!customize || isTra) { pattern.generic = false; if (isConjA) { pattern.flags |= DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE; } else { pattern.flags |= DBLOCK_COPY_TRANSPOSE; } pattern.dim.x = dims[1].y; pattern.dim.y = dims[0].bwidth; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposed[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } if (!customize || (isTra && areTails)) { pattern.generic = true; pattern.dim.x = 0; pattern.dim.y = 0; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposedGeneric[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); } pattern.generic = false; pattern.dim.x = dims[1].x; pattern.dim.y = dims[0].bwidth; if (kflags & KEXTRA_CONJUGATE_B) { pattern.flags = DBLOCK_COPY_TRANSPOSE | DBLOCK_COPY_CONJUGATE; } else { pattern.flags = DBLOCK_COPY_TRANSPOSE; } pattern.flags |= glcpFlags[1]; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposed[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.generic = true; pattern.dim.x = 0; pattern.dim.y = 0; findGenerateFunction(guard, &pattern, copyFuncs->globalToLocalTransposedGeneric[MATRIX_B], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); // generate two local zeroing functions for matrix A and matrix B blocks pattern.zeroing = true; pattern.dim = dims[0]; pattern.generic = false; pattern.flags = 0; pattern.dim.y = 1; pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].y; findGenerateFunction(guard, &pattern, copyFuncs->zeroBlock[MATRIX_A], FUNC_NAME_MAXLEN); kgenAddBlankLine(ctx); pattern.dim.x = fl4RowWidth(dims[0].bwidth, tsize) * dims[1].x; findGenerateFunction(guard, &pattern, copyFuncs->zeroBlock[MATRIX_B], FUNC_NAME_MAXLEN); ret = kgenAddBlankLine(ctx); destroyKgenGuard(guard); return ret; } int generateResultUpdateOld( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName) { UpdateResultFlags flags; flags = kextraToUpresFlags(funcID, gset->kextra->flags); return genResultUpdateWithFlagsOld(ctx, funcID, gset, flags, optFuncName, genericFuncName, NULL); } int genResultUpdateWithFlagsOld( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, UpdateResultFlags flags, const char *optFuncName, const char *genericFuncName, const char *cachedName) { KernelExtraFlags kflags = gset->kextra->flags; UpdateResultOp op; char tmp[1024]; int ret = 0; const char *coordY, *coordX; UpresVarNames uvars; const KernelVarNames *kvarNames = &gset->varNames; const SubproblemDim *dim = &gset->subdims[1]; bool areTails, useCondition; memset(&uvars, 0, sizeof(uvars)); coordX = kvarNames->coordB; coordY = kvarNames->coordA; if (funcHasTriangMatrix(funcID)) { if (flags & UPRES_TRIANG_WRITE_C) { uvars.result = "C"; } else { uvars.result = "B"; } uvars.ld = "ldb"; } else { uvars.result = "C"; uvars.ld = "ldc"; } uvars.cachedName = cachedName; /* For now, kernels that do not use UPRES_EXCEED_PROBLEM_CONDITION * must return in case problem exceeds more precise lower level conditions * (KEXTRA_TAILS_M_LOWER, KEXTRA_TAILS_N_LOWER) before updating result */ areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); 
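/*
 * Illustrative example of the guard emitted just below when both tails are
 * present (the coordinate expressions come from gset->varNames, so the
 * exact component names depend on the transposition flags; the ones shown
 * here are only one possible instantiation):
 *
 *     if ((coordA.y < M) && (coordB.x < N)) {
 *         ... result update ...
 *     }
 *
 * Work items whose tile starts outside the problem therefore skip the
 * store entirely; kernels built with UPRES_EXCEED_PROBLEM_CONDITION rely
 * on this guard instead of returning early themselves.
 */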
useCondition = areTails && ((flags & UPRES_EXCEED_PROBLEM_CONDITION) != 0); if (useCondition) { bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailN = (kflags & KEXTRA_TAILS_N) != 0; if (tailM) { if (tailN) { sprintf(tmp, "if ((%s < %s) && (%s < %s))", coordY, kvarNames->sizeM, coordX, kvarNames->sizeN); } else { sprintf(tmp, "if (%s < %s)", coordY, kvarNames->sizeM); } } else { // here tailN is true sprintf(tmp, "if (%s < %s)", coordX, kvarNames->sizeN); } kgenBeginBranch(ctx, tmp); } else { kgenAddBlankLine(ctx); } if (optFuncName) { const char *betaStr; betaStr = (flags & UPRES_WITH_BETA) ? ", beta" : ""; // update with functions invoking if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) { sprintf(tmp, "%s(%s, c, alpha, %s, %s, %s%s);\n", optFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr); } else { sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n" "if ((y == %lu) && (x == %lu)) {\n" " %s(%s, c, alpha, %s, %s, %s%s);\n" "}\n" "else {\n" " %s(%s, c, alpha, %s, %s, %s%s, y, x);\n" "}\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX, dim->y, dim->x, optFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr, genericFuncName, uvars.result, coordY, coordX, uvars.ld, betaStr); } kgenAddStmt(ctx, tmp); } else { // inline result update flags |= UPRES_INLINE; op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET; uvars.startRow = coordY; uvars.startCol = coordX; uvars.nrRows = "y"; uvars.nrCols = "x"; if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER))) { ret = updateResultGenOld(ctx, gset, op, flags, &uvars); } else { sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX); kgenAddStmt(ctx, tmp); sprintf(tmp, "if ((y == %lu) && (x == %lu))", dim->y, dim->x); kgenBeginBranch(ctx, tmp); // optimized update updateResultGenOld(ctx, gset, op, flags, &uvars); kgenEndBranch(ctx, NULL); flags |= UPRES_GENERIC; kgenBeginBranch(ctx, "else "); // not optimized update updateResultGenOld(ctx, gset, op, flags, &uvars); ret = kgenEndBranch(ctx, NULL); } } if (useCondition) { ret = kgenEndBranch(ctx, NULL); } return (ret) ? -EOVERFLOW : 0; } int genUpresFuncsWithFlags( struct KgenContext *ctx, const BlasGenSettings *gset, UpdateResultFlags flags, char optFuncName[FUNC_NAME_MAXLEN], char genericFuncName[FUNC_NAME_MAXLEN]) { KernelExtraFlags kflags = gset->kextra->flags; UpdateResultOp op; int ret; op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET; updateResultGenOld(ctx, gset, op, flags, NULL); ret = kgenAddBlankLine(ctx); if (ret) { return -EOVERFLOW; } kgenGetLastFuncName(optFuncName, FUNC_NAME_MAXLEN, ctx); if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { flags |= UPRES_GENERIC; updateResultGenOld(ctx, gset, op, flags, NULL); kgenAddBlankLine(ctx); kgenGetLastFuncName(genericFuncName, FUNC_NAME_MAXLEN, ctx); } return (ret) ? 
-EOVERFLOW : 0; } int generateUpresFuncs( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, char optFuncName[FUNC_NAME_MAXLEN], char genericFuncName[FUNC_NAME_MAXLEN]) { UpdateResultFlags flags; flags = kextraToUpresFlags(funcID, gset->kextra->flags); return genUpresFuncsWithFlags(ctx, gset, flags, optFuncName, genericFuncName); } clblas-2.10/src/library/blas/gens/legacy/gen_helper_legacy.h000066400000000000000000000044151264277366700241050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef GEN_HELPER_LEGACY_H_ #define GEN_HELPER_LEGACY_H_ #include #include #include #include "../blas_kgen.h" typedef struct CopyImgFuncs { char localToImage[2][FUNC_NAME_MAXLEN]; char globalToImage[2][FUNC_NAME_MAXLEN]; char globalToLocalTransposed[2][FUNC_NAME_MAXLEN]; char globalToLocalTransposedGeneric[2][FUNC_NAME_MAXLEN]; char globalToLocal[2][FUNC_NAME_MAXLEN]; char globalToLocalGeneric[2][FUNC_NAME_MAXLEN]; char zeroBlock[2][FUNC_NAME_MAXLEN]; } CopyImgFuncs; int generateImageCopyFuncs( CopyImgFuncs *copyFuncs, struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset); int generateResultUpdateOld( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName); int genResultUpdateWithFlagsOld( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, UpdateResultFlags flags, const char *optFuncName, const char *genericFuncName, const char *cachedName); int generateUpresFuncs( struct KgenContext *ctx, BlasFunctionID funcID, const BlasGenSettings *gset, char optFuncName[FUNC_NAME_MAXLEN], char genericFuncName[FUNC_NAME_MAXLEN]); int genUpresFuncsWithFlags( struct KgenContext *ctx, const BlasGenSettings *gset, UpdateResultFlags flags, char optFuncName[FUNC_NAME_MAXLEN], char genericFuncName[FUNC_NAME_MAXLEN]); #endif /* GEN_HELPER_LEGACY_H_ */ clblas-2.10/src/library/blas/gens/legacy/tests/000077500000000000000000000000001264277366700214365ustar00rootroot00000000000000clblas-2.10/src/library/blas/gens/legacy/tests/CMakeLists.txt000066400000000000000000000047221264277366700242030ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## set(SRC_BLKMUL ../blkmul.c ${clBLAS_SOURCE_DIR}/library/common/kerngen_core.c ${clBLAS_SOURCE_DIR}/library/common/kgen_basic.c ${clBLAS_SOURCE_DIR}/library/common/kgen_loop_helper.c ${clBLAS_SOURCE_DIR}/library/common/misc.c ${clBLAS_SOURCE_DIR}/library/blas/gens/blas_kgen.c ${clBLAS_SOURCE_DIR}/library/blas/gens/tile.c ${clBLAS_SOURCE_DIR}/library/blas/gens/tile_iter.c ${clBLAS_SOURCE_DIR}/library/blas/gens/legacy/blas_kgen_legacy.c ${clBLAS_SOURCE_DIR}/library/blas/gens/gen_helper.c ${clBLAS_SOURCE_DIR}/library/blas/gens/legacy/gen_helper_legacy.c ${clBLAS_SOURCE_DIR}/library/blas/generic/blas_funcs.c ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_dims.c ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_props.c ${clBLAS_SOURCE_DIR}/library/common/gens/dblock_kgen.c ${clBLAS_SOURCE_DIR}/library/blas/gens/tilemul.c ${clBLAS_SOURCE_DIR}/library/blas/gens/fetch.c ${clBLAS_SOURCE_DIR}/library/common/kgen_guard.c ${clBLAS_SOURCE_DIR}/library/common/list.c ${clBLAS_SOURCE_DIR}/library/common/mutex.c ${clBLAS_SOURCE_DIR}/library/common/trace_malloc.c t_blkmul.c ) include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/library/blas/include ${clBLAS_SOURCE_DIR}/library/blas/gens) add_executable(t_blkmul ${SRC_BLKMUL}) target_link_libraries(t_blkmul ${OPENCL_LIBRARIES}) set_target_properties( t_blkmul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) # CPack configuration; include the executable into the package install( TARGETS t_blkmul RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) clblas-2.10/src/library/blas/gens/legacy/tests/t_blkmul.c000066400000000000000000000530271264277366700234220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifdef __APPLE__ #include #else #include #endif #include #include #include #include #include #include #include "../blas_kgen_legacy.h" enum { ITEM_WORK_M = 8, ITEM_WORK_N = 8, GROUP_SIZE = ITEM_WORK_M * ITEM_WORK_N, BLOCKS_K = 2, PACK_RATE = 4, RAND_BOUND = 10 }; // float types based unified pointer typedef union FPtr { void *v; cl_float *f; cl_double *d; cl_float2 *f2; cl_double2 *d2; } FPtr; // float type based unified data type typedef union FType { unsigned char u[sizeof(cl_double)]; cl_float f; cl_float2 f2; cl_double d; cl_double2 d2; } FType; static void usage(void) { printf("USAGE: blkmul_test type [--iter i] [--imA] [--imB] [--alpha] a " "--[img-packed]\n" "type argument can be a value from the following list:s, d, c, z\n" "iter - number of iterations\n" "imA, imB - image usage for matrix\n" "img-packed - store elements of matrix A or (and) B " "to an image in the packed way\n"); } static void imageSizes( int *height, int *width, int blockHeight, int blockWidth, int AB, int typeSize, int packed) { *width = blockWidth * typeSize / 16; *height = blockHeight; if (packed) { int smallHeight = (AB) ? (blockHeight / ITEM_WORK_M) : (blockHeight / ITEM_WORK_N); *width *= smallHeight * PACK_RATE; *height /= smallHeight * PACK_RATE; } } void addTestPrefix(struct KgenContext *ctx, bool isDouble) { kgenDeclareUptrs(ctx, isDouble); } void addTestSuffix( struct KgenContext *ctx, const SubproblemDim subdims[2], DataType type, BlkMulOpts *mulOpts) { char c; char s[300]; bool isImageA, isImageB; char *tName; size_t m, n, k; size_t blockWidth; char imgXA[64], imgYA[64], imgXB[64], imgYB[64]; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(type); isImageA = (mulOpts->aMobj == CLMEM_IMAGE); isImageB = (mulOpts->bMobj == CLMEM_IMAGE); m = subdims[1].y; n = subdims[1].x; k = subdims[1].bwidth; blockWidth = k * BLOCKS_K; switch (type) { case TYPE_FLOAT: c = 's'; tName = "float"; break; case TYPE_DOUBLE: c = 'd'; tName = "double"; break; case TYPE_COMPLEX_FLOAT: c = 'c'; tName = "float2"; break; case TYPE_COMPLEX_DOUBLE: c = 'z'; tName = "double2"; break; default: return; } kgenAddBlankLine(ctx); kgenAddStmt(ctx, "__kernel void\n"); kgenAddStmt(ctx, "blkmul_test(\n"); sprintf(s," %s alpha,\n", tName); kgenAddStmt(ctx, s); if (isImageA) { kgenAddStmt(ctx, " __read_only image2d_t A,\n"); } else { sprintf(s," __global %s *A,\n", tName); kgenAddStmt(ctx, s); } if (isImageB) { kgenAddStmt(ctx, " __read_only image2d_t B,\n"); } else { sprintf(s," __global %s *B,\n", tName); kgenAddStmt(ctx, s); } kgenAddStmt(ctx, " size_t M,\n" " size_t N,\n" " size_t K,\n"); sprintf(s," __global %s *C,\n", tName); kgenAddStmt(ctx, s); kgenAddStmt(ctx, " size_t iter)\n"); kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "size_t i, j, it, m0, n0;\n"); if (!isImageA) { sprintf(s,"__local %s LA[%lu];\n", tName, m * ITEM_WORK_M * blockWidth); kgenAddStmt(ctx, s); } else { if (mulOpts->flags & BLKMUL_IMAGE_PACKED) { sprintf(imgXA, "(m0 / %lu) %% %d * %lu", m, PACK_RATE, m * blockWidth / vecLen); sprintf(imgYA, "m0 / %lu", m * PACK_RATE); } else { strcpy(imgXA, "0"); strcpy(imgYA, "m0"); } } if (!isImageB) { sprintf(s,"__local %s LB[%lu];\n", tName, n * ITEM_WORK_N * blockWidth); kgenAddStmt(ctx, s); } else { if (mulOpts->flags & BLKMUL_IMAGE_PACKED) { sprintf(imgXB, "(n0 / %lu) %% %d * %lu", n, PACK_RATE, n * blockWidth / vecLen); sprintf(imgYB, "n0 / %lu", n * PACK_RATE); } else { strcpy(imgXB, "0"); strcpy(imgYB, "n0"); } } sprintf(s,"__local %s LC[%lu];\n", 
tName, n * m * GROUP_SIZE); kgenAddStmt(ctx, s); sprintf(s, "m0 = %lu * (get_global_id(0) / %d);\n" "n0 = %lu * (get_global_id(0) %% %d);\n", m, ITEM_WORK_N, n, ITEM_WORK_N); kgenAddStmt(ctx, s); if (!isImageA) { kgenAddBlankLine(ctx); sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m); kgenBeginBranch(ctx, s); kgenBeginBranch(ctx, "for (j = 0; j < K; j++)"); kgenAddStmt(ctx,"LA[i * K + j] = A[i * K + j];\n"); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isImageB) { kgenAddBlankLine(ctx); sprintf(s, "for (i = n0; i < n0 + %lu; i++)", n); kgenBeginBranch(ctx, s); kgenBeginBranch(ctx,"for (j = 0; j < K; j++)"); kgenAddStmt(ctx,"LB[i * K + j] = B[i * K + j];\n"); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } kgenAddBlankLine(ctx); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"for (it = 0; it < iter; it++)"); sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m); kgenBeginBranch(ctx, s); sprintf(s, "for (j = n0; j < n0 + %lu; j++)", n); kgenBeginBranch(ctx, s); kgenAddStmt(ctx,"LC[i * N + j] = 0;\n"); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); if (isImageA) { if (isImageB) { sprintf(s, "%cgemmBlock_%lu_%lu(alpha, A, (int2)(%s, %s), B, " "(int2)(%s, %s), (LPtr)(LC + m0 * %lu + n0));\n", c, m, n, imgXA, imgYA, imgXB, imgYB, subdims[0].x); } else { sprintf(s, "%cgemmBlock_%lu_%lu(alpha, A, (int2)(%s, %s), " "(LPtr)(LB + n0 * %lu), (LPtr)(LC + m0 * %lu + n0));\n", c, m, n, imgXA, imgYA, subdims[0].bwidth, subdims[0].x); } } else { if (isImageB) { sprintf(s, "%cgemmBlock_%lu_%lu(alpha, (LPtr)(LA + m0 * %lu), B, " "(int2)(%s, %s), (LPtr)(LC + m0 * %lu + n0));\n", c, m, n, subdims[0].bwidth, imgXB, imgYB, subdims[0].x); } else { sprintf(s, "%cgemmBlock_%lu_%lu(alpha, (LPtr)(LA + m0 * %lu), " "(LPtr)(LB + n0 * %lu), (LPtr)(LC + m0 * %lu + n0));\n", c, m, n, subdims[0].bwidth, subdims[0].bwidth, subdims[0].x); } } kgenAddStmt(ctx, s); kgenEndBranch(ctx, NULL); kgenAddBlankLine(ctx); sprintf(s, "for (i = m0; i < m0 + %lu; i++)", m); kgenBeginBranch(ctx, s); sprintf(s, "for (j = n0; j < n0 + %lu; j++)", n); kgenBeginBranch(ctx, s); kgenAddStmt(ctx,"C[i * N + j] = LC[i * N + j];\n"); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); } cl_int run (char *ker, cl_uint M, cl_uint N, cl_uint K, FType alpha, DataType type, BlkMulOpts *mulOpts, cl_uint iter) { cl_int err; cl_platform_id platform; cl_context ctx; cl_device_id device; cl_command_queue queue; cl_event evt; FType tmp; cl_mem imA, imB, bufC; FPtr A, B, C, C_naive; bool is_complex = type == TYPE_COMPLEX_FLOAT || type == TYPE_COMPLEX_DOUBLE; bool is_double = type == TYPE_DOUBLE || type == TYPE_COMPLEX_DOUBLE; cl_uint nwords = (is_complex) ? 
2 : 1; unsigned int tsize = dtypeSize(type); cl_kernel kernel; const cl_image_format image_format = {CL_RGBA, CL_FLOAT}; size_t i, j, k; size_t globalWorkSize[1] = {GROUP_SIZE}; size_t localWorkSize[1] = {GROUP_SIZE}; char log[100000]; size_t logSize; cl_long sTime, fTime; cl_program program = NULL; const char *kernelName = "blkmul_test"; int imgWidth, imgHeight; bool packed = (mulOpts->flags & BLKMUL_IMAGE_PACKED); clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { return err; } queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { return err; } /* Prepare OpenCL kernel and its arguments */ program = clCreateProgramWithSource(ctx, 1, (const char**)&ker, NULL, NULL); err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS){ clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 100000, log, &logSize); printf("%s", log); clReleaseProgram(program); return err; } kernel = clCreateKernel(program, kernelName, &err); if (err != CL_SUCCESS){ clReleaseProgram(program); return err; } /* Memory allocation */ A.v = malloc(M * K * tsize); B.v = malloc(K * N * tsize); C.v = malloc(M * N * tsize); C_naive.v = malloc(M * N * tsize); srand(0); if (is_double) { for(i = 0; i < M * K * nwords; i++){ A.d[i] = (double)(rand() % RAND_BOUND); } for(i = 0; i < N * K * nwords; i++){ B.d[i] = (double)(rand() % RAND_BOUND); } for(i = 0; i < M * N * nwords; i++){ C.d[i] = 0.0; C_naive.d[i] = 0.0; } } else { for(i = 0; i < M * K * nwords; i++){ A.f[i] = (float)(rand() % RAND_BOUND); } for(i = 0; i < N * K * nwords; i++){ B.f[i] = (float)(rand() % RAND_BOUND); } for(i = 0; i < M * N * nwords; i++){ C.f[i] = 0.0; C_naive.f[i] = 0.0; } } if (mulOpts->aMobj == CLMEM_IMAGE) { imageSizes(&imgHeight, &imgWidth, M, K, 0, tsize, packed); imA = clCreateImage2D (ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, imgWidth, imgHeight, 0, A.v, &err); } else { imA = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, K * M * tsize, A.v, &err); } if (err != CL_SUCCESS) { clReleaseKernel(kernel); return err; } if (mulOpts->bMobj == CLMEM_IMAGE) { imageSizes(&imgHeight, &imgWidth, N, K, 0, tsize, packed); imB = clCreateImage2D (ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, imgWidth, imgHeight, 0, B.v, &err); } else { imB = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, K * N * tsize, B.v, &err); } if (err != CL_SUCCESS) { clReleaseMemObject(imA); clReleaseKernel(kernel); return err; } bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, M * N * tsize, C.v, &err); if (err != CL_SUCCESS) { clReleaseMemObject(imB); clReleaseMemObject(imA); clReleaseKernel(kernel); return err; } err = clEnqueueWriteBuffer (queue, bufC, CL_TRUE, 0, M * N * tsize, C.v, 0, NULL, NULL); /* Argument setting and kernel execution */ err = clSetKernelArg(kernel, 0, tsize, alpha.u); err |= clSetKernelArg(kernel, 1, sizeof(imA), &imA); err |= clSetKernelArg(kernel, 2, sizeof(imB), &imB); err |= clSetKernelArg(kernel, 3, sizeof(M), &M); err |= clSetKernelArg(kernel, 4, sizeof(N), &N); err |= clSetKernelArg(kernel, 5, sizeof(K), &K); err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC); err |= clSetKernelArg(kernel, 7, sizeof(iter), &iter); if (err != CL_SUCCESS) { clReleaseMemObject(bufC); clReleaseMemObject(imB); clReleaseMemObject(imA); clReleaseKernel(kernel); return err; } err = 
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &evt); if (err != CL_SUCCESS) { clReleaseMemObject(bufC); clReleaseMemObject(imB); clReleaseMemObject(imA); clReleaseKernel(kernel); return err; } err = clFinish(queue); err = clEnqueueReadBuffer (queue, bufC, CL_TRUE, 0, M * N * tsize, C.v, 0, NULL, NULL); /* Naive CPU multiplication */ if (is_double) { if (is_complex) { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { C_naive.d[(i * N + j) * 2] += A.d[(i * K + k) * 2] * B.d[(j * K + k) * 2] - A.d[(i * K + k) * 2 + 1] * B.d[(j * K + k) * 2 + 1]; C_naive.d[(i * N + j) * 2 + 1] += A.d[(i * K + k) * 2] * B.d[(j * K + k) * 2 + 1] + A.d[(i * K + k) * 2 + 1] * B.d[(j * K + k) * 2]; } tmp.d2.s[0] = C_naive.d[(i * N + j) * 2] * alpha.d2.s[0] - C_naive.d[(i * N + j) * 2 + 1] * alpha.d2.s[1]; tmp.d2.s[1] = C_naive.d[(i * N + j) * 2] * alpha.d2.s[1] + C_naive.d[(i * N + j) * 2 + 1] * alpha.d2.s[0]; C_naive.d[(i * N + j) * 2] = tmp.d2.s[0]; C_naive.d[(i * N + j) * 2 + 1] = tmp.d2.s[1]; } } for (i = 0; i < M * N; i++) { if ((C.d[i * 2] != C_naive.d[i * 2]) || (C.d[i * 2 + 1] != C_naive.d[i * 2 + 1])) { printf("Differ at (%lu, %lu): (%lf; %lf) != (%lf; %lf)\n", i / N, i % N, C.d[i * 2], C.d[i * 2 + 1], C_naive.d[i * 2], C_naive.d[i * 2 + 1]); break; } } if (i == M * N) { printf("Match\n"); } } else { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { C_naive.d[i * N + j] += A.d[i * K + k] * B.d[j * K + k]; } C_naive.d[i * N + j] *= alpha.d; } } for (i = 0; i < M * N; i++) { if (C.d[i] != C_naive.d[i]) { printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N, C.d[i], C_naive.d[i]); break; } } if (i == M * N) { printf("Match\n"); } } } else { if (is_complex) { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { C_naive.f[(i * N + j) * 2] += A.f[(i * K + k) * 2] * B.f[(j * K + k) * 2] - A.f[(i * K + k) * 2 + 1] * B.f[(j * K + k) * 2 + 1]; C_naive.f[(i * N + j) * 2 + 1] += A.f[(i * K + k) * 2] * B.f[(j * K + k) * 2 + 1] + A.f[(i * K + k) * 2 + 1] * B.f[(j * K + k) * 2]; } tmp.f2.s[0] = C_naive.f[(i * N + j) * 2] * alpha.f2.s[0] - C_naive.f[(i * N + j) * 2 + 1] * alpha.f2.s[1]; tmp.f2.s[1] = C_naive.f[(i * N + j) * 2] * alpha.f2.s[1] + C_naive.f[(i * N + j) * 2 + 1] * alpha.f2.s[0]; C_naive.f[(i * N + j) * 2] = tmp.f2.s[0]; C_naive.f[(i * N + j) * 2 + 1] = tmp.f2.s[1]; } } for (i = 0; i < M * N; i++) { if ((C.f[i * 2] != C_naive.f[i * 2]) || (C.f[i * 2 + 1] != C_naive.f[i * 2 + 1])) { printf("Differ at (%lu, %lu): (%lf; %lf) != (%lf; %lf)\n", i / N, i % N, C.f[i * 2], C.f[i * 2 + 1], C_naive.f[i * 2], C_naive.f[i * 2 + 1]); break; } } if (i == M * N) { printf("Match\n"); } } else { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { for (k = 0; k < K; k++) { C_naive.f[i * N + j] += A.f[i * K + k] * B.f[j * K + k]; } C_naive.f[i * N + j] *= alpha.f; } } for (i = 0; i < M * N; i++) { if (C.f[i] != C_naive.f[i]) { printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N, C.f[i], C_naive.f[i]); break; } } if (i == M * N) { printf("Match\n"); } } } /* End of naive CPU multiplication */ clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &sTime, NULL); clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &fTime, NULL); printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n", (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iter)); clReleaseMemObject(bufC); clReleaseMemObject(imB); clReleaseMemObject(imA); clReleaseKernel(kernel); 
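/*
 * Note on the reference check above: the host recomputes
 * C = alpha * A * transpose(B) (B is laid out as N x K, so row j of B
 * supplies the K-element column used for output column j) and compares it
 * element-wise with the device result using exact equality. The operands
 * are small integers (rand() % RAND_BOUND), so the floating-point results
 * are expected to match bit for bit when the generated kernel is correct.
 * The context, queue, program and host allocations are not released here;
 * the test process exits shortly after run() returns.
 */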
return CL_SUCCESS; } int main(int argc, char *argv[]) { char out[65535]; SubproblemDim subdims[2]; BlkMulOpts mulOpts; DataType dtype; int i; cl_uint iter = 1, blockM = 4, blockN = 4, blockK = 8; struct KgenContext *ctx = createKgenContext(out, 65535, 1); FType alpha; int cmdAlpha = 0; mulOpts.aMobj = CLMEM_BUFFER; mulOpts.bMobj = CLMEM_BUFFER; mulOpts.flags = BLKMUL_NO_FLAGS; // parse command line if (argc < 2) { usage(); return 1; } if (!strcmp(argv[1], "s")) { dtype = TYPE_FLOAT; alpha.f = 1; } else if (!strcmp(argv[1], "d")) { dtype = TYPE_DOUBLE; alpha.d = 1; } else if (!strcmp(argv[1], "c")) { dtype = TYPE_COMPLEX_FLOAT; alpha.f2.s[0] = 1; alpha.f2.s[1] = 0; } else if (!strcmp(argv[1], "z")) { dtype = TYPE_COMPLEX_DOUBLE; alpha.d2.s[0] = 1; alpha.d2.s[1] = 0; } else { printf("Wrong type specified: %s\n", argv[1]); return 1; } for (i = 2; i < argc; i++) { if (strcmp(argv[i], "--imA") == 0) { mulOpts.aMobj = CLMEM_IMAGE; continue; } if (strcmp(argv[i], "--imB") == 0) { mulOpts.bMobj = CLMEM_IMAGE; continue; } if (strcmp(argv[i], "--img-packed") == 0) { mulOpts.flags |= BLKMUL_IMAGE_PACKED; continue; } if (strcmp(argv[i], "--iter") == 0) { if (i + 1 == argc) { printf("Error: 'iter' argument is not specified\n"); usage(); return 1; } iter = atoi(argv[i + 1]); i++; continue; } if (strcmp(argv[i], "--alpha") == 0) { if (i + 1 == argc) { printf("Error: 'alpha' argument is not specified\n"); usage(); return 1; } cmdAlpha = atoi(argv[i + 1]); i++; continue; } if (i + 2 >= argc) { printf("Error: Not all sizes are specified\n"); usage(); return 1; } blockM = atoi(argv[i]); blockN = atoi(argv[i + 1]); blockK = atoi(argv[i + 2]); i += 2; } if (cmdAlpha) { switch (dtype) { case TYPE_FLOAT: alpha.f = cmdAlpha; break; case TYPE_DOUBLE: alpha.d = cmdAlpha; break; case TYPE_COMPLEX_FLOAT: alpha.f2.s[0] = cmdAlpha; alpha.f2.s[1] = -cmdAlpha / 2; break; case TYPE_COMPLEX_DOUBLE: alpha.d2.s[0] = cmdAlpha; alpha.d2.s[1] = -cmdAlpha / 2; break; default: break; } } subdims[0].y = blockM * ITEM_WORK_M; subdims[0].x = blockN * ITEM_WORK_N; subdims[0].bwidth = blockK * BLOCKS_K; subdims[1].y = blockM; subdims[1].x = blockN; subdims[1].bwidth = blockK; memset(out, 0, sizeof(out)); i = isDoubleBasedType(dtype); addTestPrefix(ctx, i); blkMulGen(ctx, subdims, dtype, &mulOpts); addTestSuffix(ctx, subdims, dtype, &mulOpts); run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha, dtype, &mulOpts, iter); destroyKgenContext(ctx); return 0; } clblas-2.10/src/library/blas/gens/legacy/trmm_img.c000066400000000000000000000625511264277366700222640ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * trmm image based generator */ #include #include #include #include #include #include #include #include #include #include #include "blas_kgen_legacy.h" #include "../gen_helper.h" #include "gen_helper_legacy.h" #include "trxm_common_legacy.h" static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t preparator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t genWrapper( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) { return generator(buf, buflen, subdims, pgran, extra); } else { return preparator(buf, buflen, subdims, pgran, extra); } } static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static int getPerf( unsigned int kflags, const void *args); static SolverOps imgSops = { genWrapper, assignKargs, isFitToLDS, getPerf, NULL, calcNrThreads, NULL, solverFlags, NULL, //fixupKargs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; static void imgToCopyBufFuncs( CopyBufFuncs *bufFuncs, const CopyImgFuncs *imgFuncs, KernelExtraFlags kflags) { memcpy(bufFuncs->write, imgFuncs->localToImage, FUNC_NAME_MAXLEN); if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) { memcpy(bufFuncs->read[MATRIX_A], imgFuncs->globalToLocalTransposed[MATRIX_A], FUNC_NAME_MAXLEN); memcpy(bufFuncs->readGeneric[MATRIX_A], imgFuncs->globalToLocalTransposedGeneric[MATRIX_A], FUNC_NAME_MAXLEN); } else { memcpy(bufFuncs->read[MATRIX_A], imgFuncs->globalToLocal[MATRIX_A], FUNC_NAME_MAXLEN); memcpy(bufFuncs->readGeneric[MATRIX_A], imgFuncs->globalToLocalGeneric[MATRIX_A], FUNC_NAME_MAXLEN); } } static void genPrepKernelA( struct KgenContext *ctx, const SubproblemDim *subdims, KernelExtraFlags kflags, DataType dtype, CopyImgFuncs *copyImgFuncs, const PGranularity *pgran) { char tmp[4096]; bool isBranch = false; size_t localBufSize; unsigned int tsize, vecLen; const char *typeName; CopyBufFuncs copyBufFuncs; char fpref; fpref = dtypeToBlasPrefix(dtype); typeName = dtypeBuiltinType(dtype); tsize = dtypeSize(dtype); vecLen = sizeof(cl_float4) / tsize; localBufSize = subdims[1].y * fl4RowWidth(subdims[1].bwidth, tsize); localBufSize *= vecLen; imgToCopyBufFuncs(©BufFuncs, copyImgFuncs, kflags); sprintf(tmp, "void __kernel\n" "%cprepareImageA(\n" " uint M,\n" " __global %s *A,\n" " uint lda,\n" " __write_only image2d_t imgA,\n" " uint startM,\n" " uint origM,\n" " uint offA)\n", fpref, typeName); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); kgenDeclareGroupID(ctx, "gid", pgran); kgenDeclareLocalID(ctx, "lid", pgran); sprintf(tmp, "const uint bpr = (origM + %lu) / %lu;\n" "uint currM = startM + (gid / bpr) * %lu;\n" "uint k0 = (gid %% bpr) * %lu;\n" "uint x, y;\n" "__local %s tempA[%lu];\n" "bool processed = false;\n\n", subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].y, subdims[1].bwidth, typeName, localBufSize); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "A 
+= offA;\n"); if (!(isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A) || isMatrixConj(kflags, MATRIX_A))) { if (isMatrixUpper(kflags)) { sprintf(tmp, "if (k0 >= currM + %lu)", subdims[1].y); } else { sprintf(tmp, "if (k0 + %lu <= currM)", subdims[1].bwidth); } kgenBeginBranch(ctx, tmp); sprintf(tmp, "if ((currM + %lu <= M + startM) && " "(k0 + %lu <= origM) && %d) {\n" // write directly to an image from the global memory " %s(imgA, k0 / %u, currM - startM, (GPtr)A, " "currM, k0, lda);\n" " processed = true;\n" "}\n", subdims[1].y, subdims[1].bwidth, (kflags & KEXTRA_NO_COPY_VEC_A) == 0, copyImgFuncs->globalToImage[MATRIX_A], vecLen); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "if (!processed)"); isBranch = true; } // now, zeroing blocks entirely located in the "other" triangle if (isMatrixUpper(kflags)) { sprintf(tmp, "if (k0 + %lu <= currM) {\n" " %s((__local float4*)tempA);\n" "}\n", subdims[1].bwidth, copyImgFuncs->zeroBlock[MATRIX_A]); } else { sprintf(tmp, "if (k0 >= currM + %lu) {\n" " %s((__local float4*)tempA);\n" "}\n", subdims[1].y, copyImgFuncs->zeroBlock[MATRIX_A]); } kgenAddStmt(ctx, tmp); // useful block path, reading data from the global memory to the local one kgenBeginBranch(ctx, "else"); kgenAddStmt(ctx, "M += startM;\n"); genPrepareTrxmBlockA(ctx, subdims, dtype, ©BufFuncs, (ZeroFuncs*)copyImgFuncs->zeroBlock, kflags, "origM"); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddStmt(ctx, "M -= startM;\n"); genTriangMatrBlock(ctx, subdims, dtype, kflags); kgenEndBranch(ctx, NULL); // and write to the image kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); sprintf(tmp, "%s(imgA, k0 / %u, currM - startM, (LPtr)tempA);\n", copyImgFuncs->localToImage[MATRIX_A], vecLen); kgenAddStmt(ctx, tmp); if (isBranch) { kgenEndBranch(ctx, NULL); } kgenEndFuncBody(ctx); } static void genPrepKernelB( struct KgenContext *ctx, const SubproblemDim *subdims, DataType dtype, CopyImgFuncs *copyImgFuncs, const PGranularity *pgran, KernelExtraFlags kflags) { char tmp[4096]; size_t localBufSize; unsigned int tsize, vecLen; const char *typeName; char fpref; const char *funcHead = "bool trb, aligned;\n" "const uint bpr = (origM + %lu) / %lu;\n" "const uint n = startN + (gid / bpr) * %lu;\n" "const uint k = (gid %% bpr) * %lu;\n" "uint x, y;\n" "__local %s temp[%lu];\n" "\n" "B += offB;\n" "trb = (order == clblasRowMajor) ^ (side == clblasRight);\n" "N += startN;\n"; const char *funcBody = "//copy matrix B block\n" "y = n + %u <= N ? %u : N - n;\n" "x = k + %u <= origM ? 
%u : origM - k;\n" "aligned = (x == %u) && (y == %u) && %d;\n" "if (aligned && !trb) {\n" " %s(imgB, k / %u, n - startN, (GPtr)B, n, k, ldb);\n" "}\n" "else {\n" " if (n >= N) {\n" // just zero, this is padding related part " %s((__local float4*)temp);\n" " }\n" " else if (!aligned) {\n" " // zero local memory\n" " %s((__local float4*)temp);\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " if (trb) {\n" " // generic transposed global to local\n" " %s((LPtr)temp, (GPtr)B, k, n, x, y, %u, ldb);\n" " }\n" " else {\n" " // generic global to local\n" " %s((LPtr)temp, (GPtr)B, n, k, y, x, %u, ldb);\n" " }\n" " }\n" " else {\n" " if (trb) {//transposed, aligned\n" " // optimized transposed global to local\n" " %s((LPtr)temp, (GPtr)B, k, n, ldb);\n" " }\n" " }\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s(imgB, k / %u, n - startN, (LPtr)temp);\n" "}\n" "\n"; fpref = dtypeToBlasPrefix(dtype); typeName = dtypeBuiltinType(dtype); tsize = dtypeSize(dtype); vecLen = sizeof(cl_float4) / tsize; localBufSize = subdims[1].x * fl4RowWidth(subdims[1].bwidth, tsize); localBufSize *= vecLen; sprintf(tmp, "void __kernel\n" "%cprepareImageB(\n" " clblasOrder order,\n" " clblasSide side,\n" " uint N,\n" " __global %s *B,\n" " uint ldb,\n" " __write_only image2d_t imgB,\n" " uint startN,\n" " uint origM,\n" " uint offB)\n", fpref, typeName); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); kgenDeclareGroupID(ctx, "gid", pgran); sprintf(tmp, funcHead, subdims[1].bwidth - 1, subdims[1].bwidth, subdims[1].x, subdims[1].bwidth, typeName, localBufSize); kgenAddStmt(ctx, tmp); sprintf(tmp, funcBody, subdims[1].x, subdims[1].x, // y = n + dy <= N ?... subdims[1].bwidth, subdims[1].bwidth, // x = k + bw <= M ?... subdims[1].bwidth, subdims[1].x, // aligned = (x==bw1)&&(y==dx1) (kflags & KEXTRA_NO_COPY_VEC_B) == 0, copyImgFuncs->globalToImage[MATRIX_B], vecLen, copyImgFuncs->zeroBlock[MATRIX_B], copyImgFuncs->zeroBlock[MATRIX_B], copyImgFuncs->globalToLocalTransposedGeneric[MATRIX_B], subdims[1].bwidth, copyImgFuncs->globalToLocalGeneric[MATRIX_B], subdims[1].bwidth, copyImgFuncs->globalToLocalTransposed[MATRIX_B], copyImgFuncs->localToImage[MATRIX_B], vecLen); kgenAddStmt(ctx, tmp); kgenEndFuncBody(ctx); } static void declareMainKernel( struct KgenContext *ctx, DataType dtype, KernelExtraFlags kflags, const PGranularity *pgran) { char tmp[4048]; char fpref; const char *typeName; char coordNames[2] = {'M', 'N'}; int side = ((kflags & KEXTRA_SIDE_RIGHT) != 0); fpref = dtypeToBlasPrefix(dtype); typeName = dtypeBuiltinType(dtype); sprintf(tmp, "__attribute__((reqd_work_group_size(%u, %u, 1)))\n" "void __kernel\n" "%ctrmmImg(\n" " uint %c,\n" " uint %c,\n" " const %s alpha,\n" " const __read_only image2d_t A,\n" " const __read_only image2d_t B,\n" " __global %s *C,\n" " uint ldb,\n" " const uint start%c,\n" " const uint start%c,\n" " const uint origM,\n" " const uint offB)\n", pgran->wgSize[0], pgran->wgSize[1], fpref, coordNames[side], coordNames[1 - side], typeName, typeName, coordNames[side], coordNames[1 - side]); kgenDeclareFunction(ctx, tmp); } // Preparation function for images based kernel generator static ssize_t preparator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; CopyImgFuncs copyImgFuncs; BlasGenSettings gset; ssize_t ret; bool b; memset(©ImgFuncs, 0, sizeof(copyImgFuncs)); memset(&gset, 0, sizeof(gset)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return 
-ENOMEM; } b = isDoubleBasedType(kextra->dtype); kgenDeclareUptrs(ctx, b); if (kextra->kernType == CLBLAS_PREP_B_KERNEL) { declareBlasEnums(ctx); } memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; // generate necessary memory to image copying functions generateImageCopyFuncs(©ImgFuncs, ctx, CLBLAS_TRMM, &gset); kgenAddBlankLine(ctx); if (kextra->kernType == CLBLAS_PREP_A_KERNEL) { genPrepKernelA(ctx, subdims, kextra->flags, kextra->dtype, ©ImgFuncs, pgran); } else { genPrepKernelB(ctx, subdims, kextra->dtype, ©ImgFuncs, pgran, kextra->flags); } ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "imgA"; kvars->B = "imgB"; if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "K"; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; char tmp[4096], tmp1[4096]; char *p; // is the iteration over N, N at the top level const char *typeName; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; unsigned int tsize; unsigned int vecLen, outVecLen; bool b; const char *outTypeName; unsigned int i; unsigned int nrRegs, regPitch; KernelExtraFlags kflags = kextra->flags; int tra, trb; char coordNames[2] = {'M', 'N'}; char vect[2] = {'y', 'x'}; const char *coordConstants = "const uint workItemM = startM + get_global_id(0) * %lu;\n" "const uint workItemN = startN + get_global_id(1) * %lu;\n" "const int2 skewRow = (int2)(0, get_local_id(0) %% %lu);\n" "uint vectK = (origM + %u) / %u;\n"; /* * template for image based trmm preparation part * for two dimensional work space */ const char *localVariables = "uint k0;\n" "int2 coordA = (int2)(0, workItemM - startM);\n" "int2 coordB = (int2)(0, workItemN - startN);\n" "%s c[%u];\n\n"; memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kflags); tsize = dtypeSize(dtype); vecLen = sizeof(cl_float4) / dtypeSize(dtype); if (isComplexType(dtype)) { regPitch = (unsigned int)subdims[1].x; } else { regPitch = (unsigned int) fl4RowWidth(subdims[1].x, tsize) * sizeof(cl_float4) / tsize; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); // now, generate the kernel declareMainKernel(ctx, dtype, kflags, pgran); ret = kgenBeginFuncBody(ctx); // constants sprintf(tmp, coordConstants, subdims[1].y, subdims[1].x, subdims[1].y, vecLen - 1, vecLen); kgenAddStmt(ctx, tmp); /* * Calculate local buffer pitches, and then declare local * variables */ getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); sprintf(tmp, localVariables, outTypeName, nrRegs); kgenAddStmt(ctx, tmp); // check if offset exceeds matrix kgenAddStmt(ctx, "if ((workItemM >= startM + M) ||" "(workItemN >= startN + N)) {\n" " return;\n" 
"}\n"); // zero C block sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" " c[k0] = 0;\n" "}\n\n", nrRegs); kgenAddStmt(ctx, tmp); // loop over K if (isMatrixUpper(kflags)) { sprintf(tmp, "coordA.x = vectK - %lu;\n" "coordB.x = coordA.x;\n", subdims[1].bwidth / vecLen); kgenAddStmt(ctx, tmp); sprintf(tmp, "for (k0 = ((workItemM/%lu)*%lu)/%u; " "k0 < vectK; k0 += %lu)", subdims[0].bwidth, subdims[0].bwidth, vecLen, subdims[1].bwidth / vecLen); } else { size_t dk; dk = (subdims[1].y > subdims[1].bwidth) ? subdims[1].y : subdims[1].bwidth; dk = dk / vecLen + 1; sprintf(tmp, "for (k0 = 0; " "k0 < min((workItemM+%u)/%u + %lu, vectK); " "k0 += %lu)", vecLen - 1, vecLen, dk, subdims[1].bwidth / vecLen); } kgenBeginBranch(ctx, tmp); mulOpts.aMobj = CLMEM_IMAGE; mulOpts.bMobj = CLMEM_IMAGE; mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_ROW | BLKMUL_INLINE | BLKMUL_AVOID_AND; if (isComplexType(dtype)) { mulOpts.core = BLKMUL_SEPARATE_MULADD; } else { mulOpts.core = BLKMUL_MAD; } mulOpts.argNames.coordA = "coordA"; mulOpts.argNames.coordB = "coordB"; mulOpts.argNames.skewCol = "skewCol"; mulOpts.argNames.skewRow = "skewRow"; mulOpts.argNames.k = "k0"; mulOpts.argNames.vectBoundK = "vectK"; ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } // update image coordinates if (isMatrixUpper(kflags)) { // In this case loop is inverted to avoid 'random' skews sprintf(tmp, "\ncoordA.x -= %lu;\n" "coordB.x -= %lu;\n", subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen); } else { sprintf(tmp, "\ncoordA.x += %lu;\n" "coordB.x += %lu;\n", subdims[1].bwidth / vecLen, subdims[1].bwidth / vecLen); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); // reorder the given solution outVecLen = isComplexType(dtype) ? 1 : vecLen; p = tmp1; for (i = 0; i < regPitch / outVecLen; i++) { unsigned int k = (unsigned int)(subdims[1].y - 1) * regPitch / outVecLen + i; sprintf(p, "\n" " tmp = c[%u];\n" " for (j = %lu; j >= 0; j--) {\n" " c[(j+1) * %u + %u] = c[j * %u + %u];\n" " }\n" " c[%u] = tmp;\n", k, subdims[1].y - 2, regPitch / outVecLen, i, regPitch / outVecLen, i, i); p += strlen(p); } sprintf(tmp, "\n" "for (k0 = 0; k0 < skewRow.y; k0++) {\n" " int j;\n" " %s tmp;\n" "%s" "}\n" "\n", outTypeName, tmp1); kgenAddStmt(ctx, tmp); // write back the tile evaluated tra = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_B); sprintf(tmp, "coordA.%c = workItemM - startM;\n" "coordB.%c = workItemN - startN;\n\n", vect[tra], vect[trb]); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); trb = isMatrixAccessColMaj(CLBLAS_TRMM, kextra->flags, MATRIX_C); sprintf(tmp, "__global %s *B = C + offB + start%c * ldb + start%c;\n\n", typeName, coordNames[trb], coordNames[1 - trb]); kgenAddStmt(ctx, tmp); generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, NULL, NULL); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; int side = (blasArgs->side == clblasRight); size_t sizes[2] = {blasArgs->M, blasArgs->N}; size_t offs[2] = {blasArgs->offsetM, blasArgs->offsetN}; (void)extra; switch (blasArgs->kernType) { case CLBLAS_COMPUTING_KERNEL: initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); INIT_KARG(&args[3], blasArgs->scimage[0]); INIT_KARG(&args[4], blasArgs->scimage[1]); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[6], blasArgs->ldb.matrix); initSizeKarg(&args[7], blasArgs->offsetM); initSizeKarg(&args[8], blasArgs->offsetN); initSizeKarg(&args[9], blasArgs->K); initSizeKarg(&args[10], blasArgs->offBX); break; case CLBLAS_PREP_A_KERNEL: initSizeKarg(&args[0], sizes[side]); initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[2], blasArgs->lda.matrix); INIT_KARG(&args[3], blasArgs->scimage[0]); initSizeKarg(&args[4], offs[side]); initSizeKarg(&args[5], blasArgs->K); initSizeKarg(&args[6], blasArgs->offA); break; case CLBLAS_PREP_B_KERNEL: INIT_KARG(&args[0], blasArgs->order); INIT_KARG(&args[1], blasArgs->side); initSizeKarg(&args[2], sizes[1 - side]); initMemobjKarg(&args[3], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->ldb.matrix); INIT_KARG(&args[5], blasArgs->scimage[1]); initSizeKarg(&args[6], offs[1 - side]); initSizeKarg(&args[7], blasArgs->K); initSizeKarg(&args[8], blasArgs->offBX); break; default: //this should not happen break; } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong size; const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; size = matrBlockSize(&dim[1], MATRIX_C, dtype, kargs->side); return (size * dtypeSize(dtype) <= ldsSize); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { const CLBlasKargs *kargs = args; size_t m, n, k; (void)extra; //form inner subdims with respect of multiplication side if (kargs->side == clblasRight) { m = kargs->N; n = kargs->M; //original N was stored in K k = kargs->K; } else { m = kargs->M; n = kargs->N; //original M was stored in K k = kargs->K; } if (kargs->kernType != CLBLAS_COMPUTING_KERNEL) { size_t whole, part; size_t nrGroups; // each thread gets one block if (kargs->kernType == CLBLAS_PREP_A_KERNEL) { whole = m; part = subdims[0].itemY; } else { whole = n; part = subdims[0].itemX; } nrGroups = whole / part + (whole % part != 0); nrGroups *= (k / subdims[0].bwidth + (k % subdims[0].bwidth != 0)); threads[0] = pgran->wgSize[0] * nrGroups; threads[1] = pgran->wgSize[1]; } else { calcGlobalThreads(threads, &subdims[0], pgran, m, n); } } static SolverFlags solverFlags(void) { return (SF_WSPACE_2D); } void initTrmmImgPattern(MemoryPattern *mempat) { mempat->name = "Image based block trmm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &imgSops; mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_IMAGE; mpatExtra.mobjB = CLMEM_IMAGE; mempat->extra = &mpatExtra; } static int getPerf( unsigned int kflags, const void *args) { DUMMY_ARG_USAGE(kflags); DUMMY_ARG_USAGE(args); return PPERF_POOR; } 
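/*
 * Editor's note (illustrative sketch, not part of clBLAS): the image-based
 * TRMM preparation kernels above size their NDRange by rounding the panel
 * count up to whole blocks and multiplying by the number of K-blocks, as
 * calcNrThreads() does with subdims[0]. The helpers below are hypothetical,
 * self-contained restatements of that ceiling-division logic; the names
 * ceilDiv and prepGlobalSize do not exist in the library.
 */
#include <stddef.h>

static size_t ceilDiv(size_t a, size_t b)
{
    /* number of blocks of size b needed to cover a elements */
    return a / b + (a % b != 0);
}

/* global work size along dimension 0 for a preparation kernel */
static size_t prepGlobalSize(size_t whole, size_t part, size_t k,
                             size_t bwidth, size_t wgSize0)
{
    /* one work group per (panel block, K block) pair */
    size_t nrGroups = ceilDiv(whole, part) * ceilDiv(k, bwidth);
    return wgSize0 * nrGroups;
}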
clblas-2.10/src/library/blas/gens/legacy/trmm_lds.c000066400000000000000000000350721264277366700222700ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * LDS based generator */ #include #include #include #include #include #include #include #include #include #include "../init.h" #include "blas_kgen_legacy.h" #include "gen_helper_legacy.h" #include "../gen_helper.h" #include "../trxm_common.h" #include "trxm_common_legacy.h" static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static int getPerf( unsigned int kflags, const void *args); static SolverOps solverOps = { generator, assignKargs, isFitToLDS, getPerf, NULL, NULL, NULL, solverFlags, NULL, //fixupKargs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; static void genPrepareBlockC( struct KgenContext *ctx, const ZeroFuncs *zeroFuncs) { char tmp[2048]; sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_C]); kgenAddStmt(ctx, tmp); } static void genWriteBlockB( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, KernelExtraFlags kflags) { char tmp[1024]; size_t pitch; const char *coordName[2] = {"currM", "currN"}; int trb; trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_C); pitch = matrBlockPitch(dim, MATRIX_C, dtype, clblasLeft); if (!(kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M))) { sprintf(tmp, "%s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n", copyFuncs->write, coordName[trb], coordName[1 - trb]); } else { sprintf(tmp, "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (currN + %lu <= N) ? 
%lu : N - currN;\n" "if ((y == %lu) && (x == %lu)) {\n" // fast rwrite " %s((GPtr)B, (LPtr)tempC, %s, %s, ldb);\n" "}\n" "else {\n" // slow write " %s((GPtr)B, (LPtr)tempC, %s, %s, y, x, ldb, %lu);\n" "}\n\n", dim->y, dim->y, dim->x, dim->x, dim->y, dim->x, copyFuncs->write, coordName[trb], coordName[1 - trb], copyFuncs->writeGeneric, coordName[trb], coordName[1 - trb], pitch); } kgenAddStmt(ctx, tmp); } static void genInitCurrM( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { strcpy(tmp, "currM = 0;\n"); } else { sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n", dim->y, dim->y); } kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); } static void genInternalLoopCtl( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { if (!(kflags & KEXTRA_TAILS_M)) { sprintf(tmp, "for (k0 = M - %lu; (k0 + %lu > currM) && (k0 < M); " "k0 -= %lu)", dim->bwidth, dim->bwidth, dim->bwidth); } else { sprintf(tmp, "for (k0 = (M - 1) / %lu * %lu; k0 + %lu > currM; " "k0 -= %lu)", dim->bwidth, dim->bwidth, dim->bwidth, dim->bwidth); } } else { sprintf(tmp, "for (k0 = 0; (k0 < currM + %lu) && (k0 < M); " "k0 += %lu)", dim->y, dim->bwidth); } kgenBeginBranch(ctx, tmp); } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "A"; kvars->B = "B"; if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "origM"; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; char tmp[2048]; char blkmul[128]; char updateResFn[FUNC_NAME_MAXLEN]; char updateResGenericFn[FUNC_NAME_MAXLEN]; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; size_t pitchAB, pitchC; bool b; KernelExtraFlags kflags = kextra->flags; const char *outTypeName; unsigned int nrRegs; bool useLocalC; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); int tra, trb; unsigned int l1Pans; char vect[2] = {'y', 'x'}; if (pgran->wgDim != 1) { return -EINVAL; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } /* Code that updates block of B matrix using local registers or use mad's * doesn't work on some GPUs. 
As a workaround use buffer in local memory * for unaligned matrix sizes */ useLocalC = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.pgran = pgran; gset.kextra = kextra; initKernelVarNames(&gset.varNames, kflags); // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRMM, &gset, BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT); if (useLocalC) { generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A | ZF_MATRIX_B | ZF_MATRIX_C); } else { generateUpresFuncs(ctx, CLBLAS_TRMM, &gset, updateResFn, updateResGenericFn); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A | ZF_MATRIX_B); } kgenAddBlankLine(ctx); // block multiplication function mulOpts.aMobj = CLMEM_BUFFER; mulOpts.bMobj = CLMEM_BUFFER; if (useLocalC) { mulOpts.flags = BLKMUL_SKEW_COLUMN; } else { mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | BLKMUL_SKEW_COLUMN; } // BLKMUL_MAD doesn't work here on all cards so use SEPARATE_MULADD always // as a workaround mulOpts.core = BLKMUL_SEPARATE_MULADD; ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } kgenAddBlankLine(ctx); kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx); // now, generate the kernel declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, NULL, false, false); ret = kgenBeginFuncBody(ctx); /* * Calculate local buffer pitches, and then insert the * preparative code */ pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft); getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); declareLdsBasedTrxmVariables(ctx, dtype, subdims, pgran, useLocalC); /* * B matrix is divided on panels, each work group * multiply such a panel on the whole matrix A. 
*/ sprintf(tmp, "currN = gid * %lu;\n", subdims->x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTM_NOT_ZERO)) == KEXTRA_STARTM_NOT_ZERO) || ((kflags & (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO)) == (KEXTRA_SIDE_RIGHT | KEXTRA_STARTN_NOT_ZERO))) { kgenAddStmt(ctx, "A += lda * offsetM + offsetM;\n"); } if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, false); tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B); l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n" "coordB.%c = 0;\n\n", vect[trb], l1Pans, subdims[1].x, vect[1 - trb]); kgenAddStmt(ctx, tmp); // loop over M sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y); kgenBeginBranch(ctx, tmp); sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n" "coordA.%c = 0;\n\n", vect[tra], l1Pans, subdims[1].y, vect[1 - tra]); kgenAddStmt(ctx, tmp); if (useLocalC) { genPrepareBlockC(ctx, &zeroFuncs); } else { // zero work item C block sprintf(tmp, "for (k0 = 0; k0 < %u; k0++) {\n" " c[k0] = 0;\n" "}\n\n", nrRegs); kgenAddStmt(ctx, tmp); } /* * In the first pass the part without triangle blocks is processed, * and in the second one only triangle blocks are processed */ genInternalLoopCtl(ctx, subdims, kflags); genPrepareTrxmBlockA(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags, "M"); genPrepareTrxmBlockB(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); genTriangMatrBlock(ctx, subdims, dtype, kflags); // and eventually multiply the blocks and update the matrix C block if (useLocalC) { sprintf(tmp, "%s(alpha, (LPtr)(tempA + (lid / %u * %lu) * %lu), \n" " (LPtr)(tempB + (lid %% %u * %lu) * %lu),\n" " (LPtr)(tempC + (lid / %u * %lu) * %lu + \n" " (lid %% %u * %lu)), lid);\n", blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, subdims[1].x, pitchAB, l1Pans, subdims[1].y, pitchC, l1Pans, subdims[1].x); } else { sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), " "(LPtr)(tempB + (lid %% %u * %lu) * %lu), c, lid);\n", blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, subdims[1].x, pitchAB); } kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); genInternalLoopEnd(ctx); // loop over K kgenAddBlankLine(ctx); // write back the block, it's evaluated if (useLocalC) { genWriteBlockB(ctx, subdims, dtype, ©Funcs, kflags); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); } else { if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { sprintf(tmp, "if ((coordA.%c < M) && (coordB.%c < N))", vect[tra], vect[trb]); kgenBeginBranch(ctx, tmp); } generateResultUpdateOld(ctx, CLBLAS_TRMM, &gset, updateResFn, updateResGenericFn); if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)) { kgenEndBranch(ctx, tmp); } } if (isMatrixUpper(kflags)) { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); // loop over M kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx = 7; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[6], blasArgs->ldb.matrix); if (kflags & KEXTRA_STARTM_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offsetM); } if (kflags & KEXTRA_STARTN_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offsetN); } if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offBX); } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; cl_ulong size; size = matrBlockSize(dim, MATRIX_A, dtype, kargs->side); size += matrBlockSize(dim, MATRIX_B, dtype, kargs->side); size += matrBlockSize(dim, MATRIX_C, dtype, kargs->side); return (size * dtypeSize(dtype) <= ldsSize); } static SolverFlags solverFlags(void) { return ((unsigned int)SF_WSPACE_1D); } void initTrmmLdsPattern(MemoryPattern *mempat) { mempat->name = "LDS based block trmm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &solverOps; mpatExtra.aMset = CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } static int getPerf( unsigned int kflags, const void *args) { DUMMY_ARG_USAGE(kflags); DUMMY_ARG_USAGE(args); return PPERF_POOR; } clblas-2.10/src/library/blas/gens/legacy/trsm_cached_lds.c000066400000000000000000000717111264277366700235650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * TRSM generator with support of cached reads from the global memory */ #include #include #include #include #include #include #include #include #include #include #include "../blas_kgen.h" #include "../trxm_common.h" #include "trsm_kgen_legacy.h" #include "gen_helper_legacy.h" #include "../trsm_kgen.h" static const char *readSquareBlock = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? 
%lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // just read with an optimized function " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockOpt = // just read with an optimized function "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"; static const char *readSquareBlockTrans = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // read and transpose with an optimized function " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // read and transpose with slow function " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockTransOpt = // read and transpose with an optimized function "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"; static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static void assignKargs(KernelArg *args, const void *params, const void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static SolverOps trsmSops = { generator, assignKargs, isFitToLDS, NULL, NULL, NULL, NULL, solverFlags, fixupArgs, NULL, //getDefaultDecomp NULL, // getDecompList NULL, NULL }; static TileMulFlags getCyclicFlags( const SubproblemDim *dim, KernelExtraFlags kflags, bool tailPass, unsigned int vecLen) { TileMulFlags mflags = TILEMUL_NO_FLAGS; if (tailPass && !isMatrixUpper(kflags)) { mflags |= TILEMUL_GLOBAL_CYCLIC_A; } if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) && (kflags & KEXTRA_TAILS_N) && (dim->x > vecLen)) { mflags |= TILEMUL_GLOBAL_CYCLIC_B; } return mflags; } static void initTiles(BlasGenSettings *gset) { unsigned int nrRows, nrCols; unsigned int vecLen; const SubproblemDim *dim = &gset->subdims[1]; const CLBLASKernExtra *kextra = gset->kextra; DataType dtype = kextra->dtype; bool tra; // the tile A should be able to fit rectangular and square tiles nrCols = (unsigned int)szmax(dim->y, dim->bwidth); tra = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_A); vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_A); initTile(&gset->tileA, "a", (unsigned int)dim->y, nrCols, vecLen, dtype, PRIV_STORAGE_ARRAY, tra, false); /* * tile B should be able to fit tiles of the matrix B and of the * intermediate result. 
That result will be always transposed * from the point of view of tile multiplication */ tra = !isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B); if (tra) { nrRows = (unsigned int)szmax(dim->bwidth, dim->y); nrCols = (unsigned int)dim->x; } else { nrRows = (unsigned int)szmax(dim->bwidth, dim->x); nrCols = (unsigned int)szmax(dim->x, dim->y); } vecLen = getVecLen(gset, CLBLAS_TRSM, MATRIX_B); initTile(&gset->tileBX, "b", nrRows, nrCols, vecLen, dtype, PRIV_STORAGE_ARRAY, tra, false); initTile(&gset->tileCY, "c", (unsigned int)dim->y, (unsigned int)dim->x, vecLen, dtype, PRIV_STORAGE_ARRAY, false, false); } static void prepareTilesForMainLoop(BlasGenSettings *gset) { const SubproblemDim *dim = &gset->subdims[1]; gset->tileA.nrCols = (unsigned int)dim->bwidth; gset->tileBX.nrRows = (unsigned int)dim->bwidth; gset->tileBX.nrCols = (unsigned int)dim->x; } static void declareLocalVariables( struct KgenContext *ctx, const BlasGenSettings *gset) { char tmp[1024]; const char *elemType; const SubproblemDim *dims = gset->subdims; DataType dtype = gset->kextra->dtype; size_t pitchAC, heightC; elemType = dtypeBuiltinType(dtype); pitchAC = matrBlockPitch(dims, MATRIX_C, dtype, clblasRight); heightC = szmax(dims[0].y, dims[0].x); declareTileStorages(ctx, gset); sprintf(tmp, "const int lid = get_local_id(0);\n" "const int gid = get_group_id(0);\n" "const uint2 skewRow = 0, skewCol = 0;\n\n" "GPtr uA, uB;\n" "uint coordA, coordB, k;\n" "uint x, y;\n" "__local %s tempA[%lu], tempC[%lu];\n" "LPtr utmpA, utmpC;\n" "uint m0 = 0, k0, currM, currN;\n", elemType, pitchAC * dims[0].y, pitchAC * heightC); kgenAddStmt(ctx, tmp); } static void genReadDiagBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags kflags, char c) { char tmp[1024]; size_t pitch; const char *readBlock; bool tra; tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); if (!(kflags & KEXTRA_TAILS_M)) { readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt; sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c); } else { readBlock = (tra) ? 
readSquareBlockTrans : readSquareBlock; sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth, dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c, zeroFuncs->names[MATRIX_A], c, copyFuncs->readGeneric[MATRIX_A], c, pitch); } kgenAddStmt(ctx, tmp); } static void genZeroResult( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims, unsigned int vecLen) { unsigned int n; char tmp[1024]; getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL); sprintf(tmp, "for (x = 0; x < %u; x++) {\n" " c[x] = 0;\n" "}\n\n", n); kgenAddStmt(ctx, tmp); } static void genInternalLoopCtl( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { if (kflags & KEXTRA_TAILS_M) { sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; " "k0 += %lu)", dim[0].bwidth, dim[1].bwidth, dim[1].bwidth, dim[1].bwidth); } else { sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)", dim[0].bwidth, dim[1].bwidth); } } else { sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)", dim[1].bwidth); } kgenBeginBranch(ctx, tmp); } static void genInitCurrM( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { /* start from the last block */ sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", dim->y, dim->y); kgenAddStmt(ctx, tmp); } else { kgenAddStmt(ctx, "currM = 0;\n"); } } static void initKernelVarNames(KernelVarNames *kvars) { kvars->A = "uA"; kvars->B = "uB"; kvars->coordA = "coordA"; kvars->coordB = "coordB"; kvars->k = "k"; kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "M"; kvars->lda = "lda"; kvars->ldb = "ldb"; } /* * Generate a code copying tile between LDS and private location. */ static void genLdsCopy( struct KgenContext *ctx, const BlasGenSettings *gset) { char pitchStr[16]; char coordY[128], coordX[128]; size_t pitch; UpresVarNames uvars; UpdateResultFlags upFlags = UPRES_INLINE | UPRES_USE_LDS | UPRES_WITHOUT_ALPHA | UPRES_COLUMN_MAJOR; const SubproblemDim *dims = gset->subdims; unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x); memset(&uvars, 0, sizeof(uvars)); pitch = matrBlockPitch(dims, MATRIX_C, gset->kextra->dtype, clblasRight); sprintf(pitchStr, "%lu", pitch); sprintf(coordY, "lid / %u * %lu", l1Pans, dims[1].y); sprintf(coordX, "lid %% %u * %lu", l1Pans, dims[1].x); uvars.result = "tempC"; uvars.ld = pitchStr; uvars.startRow = coordY; uvars.startCol = coordX; uvars.nrRows = NULL; uvars.nrCols = NULL; kgenBeginBranch(ctx, NULL); updateResultGen(ctx, gset, CLBLAS_TRSM, UPRES_SET, upFlags, &uvars); kgenEndBranch(ctx, NULL); kgenAddBlankLine(ctx); } static void genZeroResultTrash( struct KgenContext *ctx, const SubproblemDim *dim, const CLBLASKernExtra *kextra) { char tmp[1024]; unsigned int vecLen, pitch; unsigned int i; vecLen = (isComplexType(kextra->dtype)) ? 1 : kextra->vecLen; pitch = (unsigned int)roundUp(dim->x, vecLen); sprintf(tmp, "if (coordA + %lu > M)", dim->y); kgenBeginBranch(ctx, tmp); sprintf(tmp, "int i = (coordA >= M) ? 
%lu : (%lu - M %% %lu);\n\n", dim->y, dim->y, dim->y); kgenAddStmt(ctx, tmp); sprintf(tmp, "for (; i > 0; i--)"); kgenBeginBranch(ctx, tmp); for (i = 0; i < pitch / vecLen; i++) { sprintf(tmp, "c[(%lu - i) * %u + %u] = 0;\n", dim->y, pitch / vecLen, i); kgenAddStmt(ctx, tmp); } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } static void setupVdepUpresFlags(KernelExtraFlags kflags, UpdateResultFlags* upFlags) { bool forceBug = false; unsigned int bugFlag1 = KEXTRA_NO_COPY_VEC_A | KEXTRA_TAILS_K | KEXTRA_TAILS_M; unsigned int bugFlag2 = bugFlag1 | KEXTRA_UPPER_TRIANG | KEXTRA_TRANS_A; unsigned int bugFlag3 = bugFlag1 | KEXTRA_SIDE_RIGHT | KEXTRA_COLUMN_MAJOR; unsigned int bugFlag4 = bugFlag3 | KEXTRA_TRANS_A; unsigned int bugFlag5 = bugFlag3 | KEXTRA_UPPER_TRIANG; unsigned int bugFlag6 = KEXTRA_NO_COPY_VEC_A | KEXTRA_NO_COPY_VEC_B | KEXTRA_NO_COPY_VEC_C | KEXTRA_TAILS_K | KEXTRA_TAILS_M; unsigned int bugFlag7 = bugFlag6 | KEXTRA_COLUMN_MAJOR; unsigned int bugFlag8 = bugFlag6 | KEXTRA_SIDE_RIGHT | KEXTRA_UPPER_TRIANG; unsigned int bugFlag9 = bugFlag6 | KEXTRA_UPPER_TRIANG | KEXTRA_TRANS_A | KEXTRA_TAILS_N; unsigned int bugFlag10 = bugFlag7 | KEXTRA_SIDE_RIGHT | KEXTRA_TRANS_A | KEXTRA_TAILS_N; unsigned int bugFlag11 = bugFlag9 | KEXTRA_UNIT_DIAGONAL; unsigned int bugFlag12 = bugFlag6 | KEXTRA_TAILS_N | KEXTRA_SIDE_RIGHT | KEXTRA_UNIT_DIAGONAL | KEXTRA_COLUMN_MAJOR | KEXTRA_TRANS_A; /* * WORKAROUND for AMD GPU: Now, we avoid optimizing the case when * matrix B is not divided on block size and * since it leads to a hang up at code seeming * correct. */ if (kflags & KEXTRA_VENDOR_AMD) { forceBug = (kflags & KEXTRA_TAILS_N) != 0; } else { forceBug = (kflags != bugFlag1 && kflags != bugFlag2 && kflags != bugFlag4 && kflags != bugFlag5 && kflags != bugFlag7 && kflags != bugFlag8 && kflags != bugFlag9 && kflags != bugFlag10 && kflags != bugFlag11 && kflags != bugFlag12); } if (!forceBug) { *upFlags |= UPRES_INDEXING_WITH_CONSTANTS; } } static void genSetupCoordinates( struct KgenContext *ctx, const SubproblemDim *dims, KernelExtraFlags kflags) { char tmp[1024]; unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x); sprintf(tmp, "coordA = currM + lid / %u * %lu;\n", l1Pans, dims[1].y); kgenAddStmt(ctx, tmp); if (isMatrixUpper(kflags)) { sprintf(tmp, "k = currM + %lu;\n", dims[0].y); } else { strcpy(tmp, "k = 0;\n"); } kgenAddStmt(ctx, tmp); } static void genInvertDiagBlock( struct KgenContext *ctx, const BlasGenSettings *gset, const ZeroFuncs *zeroFuncs) { char tmp[1024]; const CLBLASKernExtra *kextra = gset->kextra; const SubproblemDim *subdims = gset->subdims; size_t pitchA; pitchA = matrBlockPitch(subdims, MATRIX_A, kextra->dtype, clblasLeft); sprintf(tmp, "%s((__local float4*)tempA);\n", zeroFuncs->names[MATRIX_A]); kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (kextra->flags & KEXTRA_UNIT_DIAGONAL) { sprintf(tmp, "if (lid < %lu) {\n" " tempC[lid * %lu + lid] = %s;\n" "}\n", subdims[0].bwidth, pitchA, strOne(kextra->dtype)); kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); } sprintf(tmp, "if (lid < %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? 
" "M - currM : %lu);\n", subdims[0].y, subdims[0].y); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); } static void genMulOnDiagBlock( struct KgenContext *ctx, BlasGenSettings *gset, const TileMulOpts *mulOpts) { char tmp[1024]; const SubproblemDim *dims = gset->subdims; const CLBLASKernExtra *kextra = gset->kextra; unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x); TileMulOpts optsNew; size_t pitchAC; const char *ptrName; Tile *tile; BlasGenSettings gsetNew; pitchAC = matrBlockPitch(dims, MATRIX_C, kextra->dtype, clblasRight); ptrName = dtypeUPtrField(kextra->dtype); memcpy(&optsNew, mulOpts, sizeof(optsNew)); optsNew.memA = CLMEM_LOCAL_MEMORY; optsNew.memB = CLMEM_LOCAL_MEMORY; optsNew.flags &= ~(TILEMUL_TRA | TILEMUL_GLOBAL_CYCLIC | TILEMUL_CONJA); optsNew.flags |= TILEMUL_TRB; optsNew.memA = CLMEM_LOCAL_MEMORY; optsNew.memB = CLMEM_LOCAL_MEMORY; gset->varNames.A = "utmpA"; gset->varNames.B = "utmpC"; sprintf(tmp, "utmpA.%s = tempA + lid / %u * %lu;\n" "utmpC.%s = tempC + lid %% %u * %lu;\n\n", ptrName, l1Pans, pitchAC * dims[1].y, ptrName, l1Pans, pitchAC * dims[1].x); kgenAddStmt(ctx, tmp); memcpy(&gsetNew, gset, sizeof(gsetNew)); gsetNew.subdims[1].bwidth = dims[1].y; // Configure the tile descriptors to deal with tile of needed sizes. tile = &gsetNew.tileA; tile->nrRows = (unsigned int)dims[1].y; tile->nrCols = (unsigned int)dims[1].y; tile->trans = false; tile = &gsetNew.tileBX; tile->nrRows = (unsigned int)dims[1].y; tile->nrCols = (unsigned int)dims[1].x; tile->trans = true; tileMulGen(ctx, &gsetNew, &optsNew); gset->varNames.A = "uA"; gset->varNames.B = "uB"; } static void genOneTrsmPass( struct KgenContext *ctx, BlasGenSettings *gset, const char *updateResFnRev, const char *updateResGenericFnRev, CopyBufFuncs *copyFuncs, ZeroFuncs *zeroFuncs, bool isTailPass) { const CLBLASKernExtra *kextra = gset->kextra; CLBLASKernExtra kextraTmp; KernelExtraFlags kflags = kextra->flags; char tmp[1024]; DataType dtype = kextra->dtype; unsigned int vecLen = gset->kextra->vecLen; SubproblemDim *subdims = gset->subdims; int tra, trb; UpdateResultFlags upFlags; TilePostFetchPrivate pfpriv; TileMulOpts mulOpts; TailFetch tf; TailStatus tailStatus = 0; memset(&pfpriv, 0, sizeof(pfpriv)); // multiply options mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.core = TILEMUL_MAD;//TILEMUL_MULADD; mulOpts.postFetch = NULL; mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags); mulOpts.flags |= TILEMUL_EXTERN_RDECL; mulOpts.flags |= getCyclicFlags(subdims, kflags, isTailPass, vecLen); tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); tf = checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra, MATRIX_B, false, false); if (trb) { tf &= ~FETCH_TAIL_COL; } /* * For lower triangular matrix we proceed upto the diagonal, so we * can't exceed matrix bound and zeroing is not needed */ if (isMatrixUpper(kflags)) { tf |= checkForTailFetches(CLBLAS_TRSM, &subdims[1], kextra, MATRIX_A, false, false); if (tra && trb) { tf &= ~FETCH_TAIL_COL; } } if (tf != FETCH_NO_TAILS) { memset(&pfpriv, 0, sizeof(pfpriv)); pfpriv.funcID = CLBLAS_TRSM; pfpriv.gset = gset; } // loop over M if (!isTailPass) { sprintf(tmp, "for (m0 = 0; m0 < M / %lu * %lu; m0 += %lu)", subdims->y, subdims->y, subdims->y); kgenBeginBranch(ctx, tmp); } genSetupCoordinates(ctx, subdims, kflags); genZeroResult(ctx, dtype, subdims, vecLen); if (!isMatrixUpper(kflags) && isTailPass) { 
// skip update loop is the matrix consist of the single block sprintf(tmp, "if (M > %lu)", subdims->y); kgenBeginBranch(ctx, tmp); } // Avoid tail adjusting along M. memcpy(&kextraTmp, kextra, sizeof(kextraTmp)); kextraTmp.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER); // update loop is not needed for tail of an upper triangular matrix if (!(isTailPass && isMatrixUpper(kflags))) { if (isTailPass || (kflags & KEXTRA_TAILS_N)) { kgenBeginBranch(ctx, "if (coordB < N)"); } gset->kextra = &kextraTmp; tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, gset, NULL); gset->kextra = kextra; genInternalLoopCtl(ctx, subdims, kflags); // loop over K // multiplication for the step-by-step block updating subdims[0].bwidth = subdims[1].bwidth; tileMulGen(ctx, gset, &mulOpts); subdims[0].bwidth = subdims[0].y; genInternalLoopEnd(ctx); // loop over K kgenAddBlankLine(ctx); // invoke once again, in order to process tails along K if (isMatrixUpper(kflags) && (tf != FETCH_NO_TAILS)) { subdims[0].bwidth = subdims[1].bwidth; if (!(tra && trb)) { mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL; } mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; mulOpts.postFetchPriv = &pfpriv; mulOpts.postFetch = defaultTilePostFetch; subdims[0].bwidth = subdims[1].bwidth; tileMulGen(ctx, gset, &mulOpts); subdims[0].bwidth = subdims[0].y; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; } gset->kextra = &kextraTmp; checkGenRestoreTailCoords(ctx, gset, tailStatus); gset->kextra = kextra; if (isTailPass || (kflags & KEXTRA_TAILS_N)) { kgenEndBranch(ctx, NULL); } } else if (!trb && (kflags & KEXTRA_TAILS_N)) { tailStatus |= TAIL_B_RAISED; } mulOpts.flags &= ~(TILEMUL_WRAP_AROUND_TAIL | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K); if (!isMatrixUpper(kflags) && isTailPass) { /* * end of branch for non single block tail processing of * the lower triangular matrix */ kgenEndBranch(ctx, NULL); } /* * Final phase: update the accumulated result, multiply on an inverted * block and write back the result */ if (isMatrixUpper(kflags) || ((kflags & KEXTRA_VENDOR_AMD) != 0)) { kgenAddStmt(ctx, "k0 = currM;\n"); } else { kgenAddStmt(ctx, "k0 = m0;\n"); } genReadDiagBlock(ctx, subdims, dtype, copyFuncs, zeroFuncs, kflags, 'C'); genInvertDiagBlock(ctx, gset, zeroFuncs); // Avoid generating not executed non optimal path gset->kextra = &kextraTmp; if (isTailPass) { kextraTmp.flags |= (KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER); } genUpdateIntermTrsmResult(ctx, gset, updateResFnRev, updateResGenericFnRev, true); gset->kextra = kextra; /* * Heap to LDS. 
* Zero unuseful part along columns since it will have an influence * on the result at multiplication on an inverted block */ if (isTailPass) { genZeroResultTrash(ctx, &subdims[1], kextra); } genLdsCopy(ctx, gset); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); genZeroResult(ctx, dtype, subdims, vecLen); genMulOnDiagBlock(ctx, gset, &mulOpts); // write back the tile evaluated upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); upFlags |= UPRES_EXCEED_PROBLEM_CONDITION; setupVdepUpresFlags(kflags, &upFlags); gset->kextra = &kextraTmp; genResultUpdateWithFlags(ctx, CLBLAS_TRSM, gset, upFlags, NULL, NULL, NULL); gset->kextra = kextra; kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); if (isMatrixUpper(kflags)) { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); if (!isTailPass) { kgenEndBranch(ctx, NULL); // loop over M } } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[1024]; struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; BlasGenSettings gset; char updateResFnRev[FUNC_NAME_MAXLEN]; char updateResGenericFnRev[FUNC_NAME_MAXLEN]; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; UpdateResultFlags upFlags; const char *ptrName; bool b; ssize_t ret; unsigned int l1Pans = (unsigned int)(subdims[0].x / subdims[1].x); bool tailMarker[2] = {false, true}; int triang; int i; if (pgran->wgDim != 1) { return -EINVAL; } if (kflags & KEXTRA_TAILS_M) { kflags |= KEXTRA_TAILS_M_LOWER; } if (kflags & KEXTRA_TAILS_N) { kflags |= KEXTRA_TAILS_N_LOWER; } if (kflags & KEXTRA_TAILS_K) { kflags |= KEXTRA_TAILS_K_LOWER; } kextra->flags = kflags; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } triang = isMatrixUpper(kflags); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames); b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } /* * For intermediate result after blocks modification. 
* Take into account tails adjusting */ upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST; if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B) && (kflags & KEXTRA_TAILS_N)) { upFlags |= UPRES_TAIL_COL; } setupVdepUpresFlags(kflags, &upFlags); initTiles(&gset); genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev, updateResGenericFnRev); generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_A); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A); //matrix inversion function genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags); kgenAddBlankLine(ctx); // now, generate the kernel declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false, true); ret = kgenBeginFuncBody(ctx); declareLocalVariables(ctx, &gset); prepareTilesForMainLoop(&gset); sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, false); ptrName = dtypeUPtrField(dtype); sprintf(tmp, "uA.%s = A;\n" "uB.%s = B;\n\n", ptrName, ptrName); kgenAddStmt(ctx, tmp); /* * B matrix is divided on panels, each work group * multiply such a panel on the whole matrix A. */ sprintf(tmp, "coordB = gid * %lu + lid %% %u * %lu;\n", subdims[0].x, l1Pans, subdims[1].x); kgenAddStmt(ctx, tmp); for (i = 0; i < 2; i++) { b = (i) ? tailMarker[1 - triang] : tailMarker[triang]; if (!b || (kflags & KEXTRA_TAILS_M)) { genOneTrsmPass(ctx, &gset, updateResFnRev, updateResGenericFnRev, ©Funcs, &zeroFuncs, b); } } kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong sizeA, sizeC; const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; /* * It's needed one block for matrix A, * and one block of size maximal of this one for * matrix A and matrix C */ sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side); sizeC = matrBlockSize(dim, MATRIX_B, dtype, kargs->side); if (sizeA > sizeC) { sizeC = sizeA; } return ((sizeA + sizeC) * dtypeSize(dtype) <= ldsSize); } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx = 7; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[6], blasArgs->ldb.matrix); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offBX); } } static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { (void)extra; (void)subdims; fixupTrxmKargs((CLBlasKargs*)args); } void initTrsmCachedPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block trsm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 0; mempat->sops = &trsmSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } clblas-2.10/src/library/blas/gens/legacy/trsm_img.c000066400000000000000000001021211264277366700222560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Image based trsm generator */ #include #include #include #include #include #include #include #include #include "blas_kgen_legacy.h" #include "gen_helper_legacy.h" #include "trsm_kgen_legacy.h" #include "../gen_helper.h" #include "../trsm_kgen.h" #include static const char *trsmImDecl = "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" "void __kernel\n" "%ctrsmIm(\n" " uint %c,\n" " uint %c,\n" " %s alpha,\n" " __read_only image2d_t A,\n" " __global %s *B,\n" " uint ldb,\n" " uint startRow,\n" " uint finishRow,\n" " uint offB)\n"; /* * template for memory object based trsm preparation part * for one dimensional work space */ static const char *trsmImPrep1D = "uint m0, k0;\n" "__local %s tempC[%lu];\n" "%s c[%u];\n" "const int lid = get_local_id(0);\n" "const int skew = lid %% %lu;\n" "%s" // groups per Panel variable "uint blockN;\n" "uint x, y, imx, imy;\n" "uint2 coordA, coordB;\n" "\n" "const uint currN = get_global_id(0) / %u * %lu;\n" // group ID "\n"; static const char *readRectBlock = "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // just read with an optimized function " %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n" "}\n\n"; static const char *readRectBlockOpt = // just read with an optimized function "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"; static const char *readRectBlockTrans = "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= finishRow) ? %lu : finishRow - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // read and transpose with an optimized function " %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // read and transpose with slow function " %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n" "}\n\n"; static const char *readRectBlockTransOpt = // read and transpose with an optimized function "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"; static ssize_t wrapper( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static ssize_t prepGenerator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static void calcNrThreads( size_t threads[2], const SubproblemDim *dims, const PGranularity *pgran, const void *args, const void *extra); static void imgPackMode( const void *extra, const SubproblemDim *dims, int dataID, unsigned int *packRate, clblasOrder *packOrder); static SolverFlags solverFlags(void); static SolverOps solverOps = { wrapper, assignKargs, isFitToLDS, NULL, NULL, calcNrThreads, imgPackMode, solverFlags, NULL, //fixupArgs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; static CLBLASMpatExtra mpatExtra; /* Prepare A kernel begin */ static const char *trsmPrepDecl = "void __kernel\n" "%ctrsmPrepare(\n" " uint %c,\n" " __global %s *A,\n" " uint lda,\n" " __write_only image2d_t imA,\n" " 
uint startRow,\n" " uint offA)\n"; /* * template for memory object based trsm preparation part * for one dimensional work space */ static const char *trsmPrep1D = "__local %s tempA[%lu];\n" "__local %s tempC[%lu];\n" "int lid, gid;\n" "uint currM, k0;\n" "uint x, y, imx, imy;\n" "\n" "lid = get_local_id(0);\n" "gid = get_global_id(0) / %u;\n" // group ID "A += offA;\n" "\n"; static const char *readSquareBlock = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // just read with an optimized function " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockOpt = // just read with an optimized function "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"; static const char *readSquareBlockTrans = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // read and transpose with an optimized function " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // read and transpose with slow function " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockTransOpt = // read and transpose with an optimized function "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"; static bool useTransposedMul(const SubproblemDim *dims, DataType dtype, bool trb) { unsigned int vecLen; vecLen = sizeof(cl_float4) / dtypeSize(dtype); return (!(trb || isComplexType(dtype) || (dims[1].x % vecLen))); } static size_t calcPitchB(const SubproblemDim *dim, DataType dtype, bool transpMul) { size_t ret; size_t tsize; tsize = dtypeSize(dtype); ret = (transpMul) ? dim->x : dim->bwidth; ret = fl4RowWidth(ret, tsize) * sizeof(cl_float4) / tsize; return ret; } static void genPrepareSquareBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, bool tra, char c, bool opt) { char tmp[1024]; size_t pitch; const char *readBlock; pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); if (opt) { readBlock = (tra) ? readSquareBlockTransOpt : readSquareBlockOpt; sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c); } else { readBlock = (tra) ? 
readSquareBlockTrans : readSquareBlock; sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth, dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c, zeroFuncs->names[MATRIX_A], c, copyFuncs->readGeneric[MATRIX_A], c, pitch); } kgenAddStmt(ctx, tmp); } static void genPrepZeroBlockC( struct KgenContext *ctx, const ZeroFuncs *zeroFuncs) { char tmp[1024]; sprintf(tmp, "%s((__local float4*)tempC);\n", zeroFuncs->names[MATRIX_A]); kgenAddStmt(ctx, tmp); } static void genWriteBlock( struct KgenContext *ctx, const SubproblemDim *dim, const CopyBufFuncs *copyFuncs) { char tmp[1024]; sprintf(tmp, "%s(imA, imx, imy, (LPtr)tempC, %lu, %lu, %lu);\n", copyFuncs->write, dim[0].y, dim[0].y, dim[0].y); kgenAddStmt(ctx, tmp); } static void getBufferPos(struct KgenContext *ctx, bool isU) //n -> x,y buffer { kgenDeclareFunction(ctx, "void\ngetBufferPos(uint n, uint startRow, " "uint width, uint *y, " "uint *x)\n"); kgenBeginFuncBody(ctx); if (isU) { //n from beginning kgenAddStmt(ctx, "n += (2 * width - startRow + 1) * (startRow) / 2;\n"); kgenAddStmt(ctx, "*y = trunc((2 * width + 1) - " "sqrt((2 * width + 1) *" "(2 * width + 1) - 8 * n)) / 2;\n"); kgenAddStmt(ctx, "*x = *y + n - (2 * width - *y + 1) * (*y) / 2;\n"); } else { //n from beginning kgenAddStmt(ctx, "n += startRow * (startRow + 1) / 2;\n"); kgenAddStmt(ctx, "*y = trunc((-0.5 + sqrt(2.0 * n + 0.25)));\n"); kgenAddStmt(ctx, "*x = n - (*y) * (*y + 1) / 2;\n"); } kgenEndFuncBody(ctx); kgenAddBlankLine(ctx); } static void genGetImagePos( struct KgenContext *ctx, const SubproblemDim *subdims, DataType dtype, const char *blockName, bool tra) //n -> x,y image { char tmp[1024]; const char *parName; const char *op[2] = {"/", "%"}; parName = (tra) ? "bpc" : "bpr"; sprintf(tmp, "imy = %s %s %s * %lu;\n" "imx = (%s %s %s) * %lu;\n", blockName, op[tra], parName, subdims[0].y, blockName, op[1 - tra], parName, subdims[0].y * dtypeSize(dtype) / sizeof(cl_float4)); kgenAddStmt(ctx, tmp); } // global memory to image converter static ssize_t prepGenerator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; char tmp[1024]; const char *typeName; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; char fpref; DataType dtype = kextra->dtype; KernelExtraFlags kflags = kextra->flags; ssize_t ret; size_t pitchAB; bool b; bool tra, trb, isU, transpMul; BlasGenSettings gset; if (pgran->wgDim != 1) { return -EINVAL; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); isU = isMatrixUpper(kflags); // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_A | BCHF_WRITE_OUTPUT | BCHF_IMAGE_WRITE); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A); //matrix inversion function genInvertingBlockFunc(ctx, (unsigned int)subdims[0].bwidth, dtype, isU); //coordinates calculation getBufferPos(ctx, isU); typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); // now, generate the kernel sprintf(tmp, trsmPrepDecl, fpref, 'M', typeName, typeName, 
typeName, typeName); kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); transpMul = useTransposedMul(subdims, dtype, trb); if (!transpMul) { sprintf(tmp, "const int bpr = get_image_width(imA) / %lu;\n", subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype))); } else { sprintf(tmp, "const int bpc = get_image_height(imA) / %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); /* * Calculate local buffer pitches, and then insert the * preparative code */ pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y, typeName, pitchAB * subdims[0].y, pgran->wgSize[0]); ret = kgenAddStmt(ctx, tmp); sprintf(tmp, "getBufferPos(gid, startRow / %lu, (M + %lu) / %lu, &currM, &k0);\n", subdims[0].y, subdims[0].y - 1, subdims[0].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "currM *= %lu;\n" "k0 *= %lu;\n", subdims[0].y, subdims[0].y); kgenAddStmt(ctx, tmp); genGetImagePos(ctx, subdims, dtype, "gid", transpMul); kgenBeginBranch(ctx, "if (currM == k0)"); genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, tra, 'A', !(kextra->flags & KEXTRA_TAILS_M)); genPrepZeroBlockC(ctx, &zeroFuncs); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (kextra->flags & KEXTRA_UNIT_DIAGONAL) { sprintf(tmp, "if (lid < %lu) {\n" " tempA[lid * %lu + lid] = %s;\n" "}\n", subdims[0].bwidth, pitchAB, strOne(dtype)); kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); } sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, tmp); sprintf(tmp, "invert(tempA, tempC, lid, (currM + %lu > M) ? " "M - currM : %lu);\n", subdims[0].y, subdims[0].y); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, tra, 'C', !(kextra->flags & KEXTRA_TAILS_M)); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenEndBranch(ctx, NULL); genWriteBlock(ctx, subdims, ©Funcs); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } static void genZeroResult( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims) { unsigned int n; char tmp[1024]; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL); sprintf(tmp, "for (x = 0; x < %u; x++) {\n" " c[x] = 0;\n" "}\n\n", n); kgenAddStmt(ctx, tmp); } static void genPrepareRectBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, bool trb, char c, bool opt) { char tmp[1024]; size_t pitch; const char *readBlock; size_t bsizes[2] = {dim->bwidth, dim->x}; /* * NOTE: in case of accessing to B in the non transposed way * block multiplication is done with transposed block B */ pitch = calcPitchB(dim, dtype, !trb); if (opt) { readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt; sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c); } else { readBlock = (trb) ? 
readRectBlockTrans : readRectBlock; sprintf(tmp, readBlock, bsizes[trb], bsizes[trb], bsizes[1 - trb], bsizes[1 - trb], bsizes[trb], bsizes[1 - trb], copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c, copyFuncs->readGeneric[MATRIX_B], c, pitch); } kgenAddStmt(ctx, tmp); } static void getNblock(struct KgenContext *ctx, bool isU) //x, y -> n { kgenDeclareFunction(ctx, "void\ngetNBlock(uint y, uint x, uint startRow, " "uint width, uint *n)\n"); kgenBeginFuncBody(ctx); if (isU) { kgenAddStmt(ctx, "*n = ((2 * width - y + 1) * y - " "(2 * width - startRow + 1) * startRow) / 2 + x - y;\n"); } else { kgenAddStmt(ctx, "*n = (y * (y + 1) - startRow * (startRow + 1)) / 2 + x;\n"); } kgenEndFuncBody(ctx); kgenAddBlankLine(ctx); } static void genMultiplication( struct KgenContext *ctx, const SubproblemDim *dims, DataType dtype, const char *blkmulName, BlkMulFlags mulFlags) { char tmp[1024]; size_t u; unsigned int l1Pans; l1Pans = (unsigned int)(dims[0].x / dims[1].x); if (mulFlags & BLKMUL_TRANSPOSED_B) { u = 1; } else { u = matrBlockPitch(dims, MATRIX_B, dtype, clblasLeft); } // find image position and invoke the multiplier sprintf(tmp, "getNBlock(m0 / %lu, k0 / %lu, startRow / %lu, " "(M + %lu) / %lu, &blockN);\n", dims[0].y, dims[0].y, dims[0].y, dims[0].y - 1, dims[0].y); kgenAddStmt(ctx, tmp); genGetImagePos(ctx, dims, dtype, "blockN", (mulFlags & BLKMUL_TRANSPOSED_B) != 0); sprintf(tmp, "%s(A, (int2)(imx, imy + lid / %u * %lu), \n" " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n" " c, skew);\n", blkmulName, l1Pans, dims[1].y, l1Pans, dims[1].x, u); kgenAddStmt(ctx, tmp); } static void genReorderSolution( struct KgenContext *ctx, const SubproblemDim *subdims, const char *outTypeName, unsigned int colRegs) { char tmp[1024], tmp1[1024]; char *p; unsigned i; sprintf(tmp, "void\n" "reorderResult(%s *c, int skew)", outTypeName); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); sprintf(tmp, "%s tmp;\n" "int i, j;\n", outTypeName); kgenAddStmt(ctx, tmp); p = tmp1; for (i = 0; i < colRegs; i++) { unsigned int k = (unsigned int)(subdims[1].y - 1) * colRegs + i; sprintf(p, "\n" " tmp = c[%u];\n" " for (j = %lu; j >= 0; j--) {\n" " c[(j+1) * %u + %u] = c[j * %u + %u];\n" " }\n" " c[%u] = tmp;\n", k, subdims[1].y - 2, colRegs, i, colRegs, i, i); p += strlen(p); } sprintf(tmp, "\n" "for (i = 0; i < skew; i++) {\n" "%s" "}\n" "\n", tmp1); kgenAddStmt(ctx, tmp); kgenEndFuncBody(ctx); kgenAddBlankLine(ctx); } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "imgA"; kvars->B = "B"; if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "origM"; } // image based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; CLBLASKernExtra kextraTmp = *kextra; char tmp[1024], tmp1[1024]; char blkmul[FUNC_NAME_MAXLEN]; char updateResFn[FUNC_NAME_MAXLEN]; char updateResGenericFn[FUNC_NAME_MAXLEN]; char updateResFnRev[FUNC_NAME_MAXLEN]; char updateResGenericFnRev[FUNC_NAME_MAXLEN]; char copyPLFn[FUNC_NAME_MAXLEN]; char *s1 = ""; const char *typeName; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; char fpref; DataType dtype = kextra->dtype; ssize_t ret; 
BlasGenSettings gset; BlkMulOpts mulOpts; BlkMulFlags mulFlags; size_t pitchAB; size_t u; bool b; bool isU; bool areTails; const char *outTypeName; unsigned int nrRegs, colRegs; KernelExtraFlags kflags = kextra->flags; size_t tsize; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); UpdateResultFlags upFlags; int tra, trb; unsigned int l1Pans; char vect[2] = {'y', 'x'}; if (pgran->wgDim != 1) { return -EINVAL; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } tsize = dtypeSize(dtype); areTails = (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N)); isU = isMatrixUpper(kflags); tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; /* * Force generation of the transposed version of the block * reading function with following multiplication with transposed * block B to decrease LDS bank conflicts without column skew using. * Reverse temporarily the flag of the column-major order for that */ if (useTransposedMul(subdims, dtype, trb)) { if (kflags & KEXTRA_COLUMN_MAJOR) { kflags &= ~KEXTRA_COLUMN_MAJOR; } else { kflags |= KEXTRA_COLUMN_MAJOR; } mulFlags = BLKMUL_SKEW_ROW | BLKMUL_TRANSPOSED_B; u = subdims[1].y; } else { mulFlags = BLKMUL_SKEW_COLUMN; u = subdims[0].y / (sizeof(cl_float4) / dtypeSize(dtype)); } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); kextraTmp.flags = kflags; memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = &kextraTmp; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kextra->flags); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_B); /* * Temporary kernel extra has been needed to produce inverted block B read. 
* Restore the original one, and restore kflags as well */ gset.kextra = kextra; kflags = kextra->flags; // functions updating result // for the final result generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn, updateResGenericFn); // for intermediate result after blocks modification upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST; genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev, updateResGenericFnRev); // for heaping before multiplying on inverted block upFlags = UPRES_USE_LDS; if (!(mulFlags & BLKMUL_TRANSPOSED_B)) { upFlags |= UPRES_COLUMN_MAJOR; } updateResultGenOld(ctx, &gset, UPRES_SET, upFlags, NULL); kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx); kgenAddBlankLine(ctx); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_B | ZF_MATRIX_C); // block multiplication function mulOpts.aMobj = CLMEM_IMAGE; mulOpts.bMobj = CLMEM_BUFFER; mulOpts.flags = BLKMUL_OUTPUT_PRIVATE | mulFlags; if (isComplexType(dtype)) { mulOpts.core = BLKMUL_SEPARATE_MULADD; } else { mulOpts.core = BLKMUL_MAD; } ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } kgenAddBlankLine(ctx); kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx); typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); // block number calculation getNblock(ctx, isU); getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); if (isComplexType(dtype)) { colRegs = (unsigned int)subdims[1].x; } else { colRegs = (unsigned int)fl4RowWidth(subdims[1].x, tsize); } if (mulFlags & BLKMUL_SKEW_ROW) { genReorderSolution(ctx, subdims, outTypeName, colRegs); } // now, generate the kernel if (kflags & KEXTRA_SIDE_RIGHT) { sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, 'N', 'M', typeName, typeName, typeName, typeName); } else { sprintf(tmp, trsmImDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, 'M', 'N', typeName, typeName, typeName, typeName); } kgenDeclareFunction(ctx, tmp); ret = kgenBeginFuncBody(ctx); if (!(mulFlags & BLKMUL_TRANSPOSED_B)) { sprintf(tmp, "const int bpr = get_image_width(A) / %lu;\n", subdims[0].y / (sizeof(cl_float4) / tsize)); } else { sprintf(tmp, "const int bpc = get_image_height(A) / %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); /* * Calculate local buffer pitches, and then insert the * preparative code */ pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); sprintf(tmp, trsmImPrep1D, typeName, pitchAB * subdims[0].x, outTypeName, nrRegs, u, s1, pgran->wgSize[0], subdims[0].itemX); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "B += offB;\n"); sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n" "coordB.%c = 0;\n\n", vect[trb], l1Pans, subdims[1].x, vect[1 - trb]); kgenAddStmt(ctx, tmp); /* * B matrix is divided on panels, each work group * multiply such a panel on the whole matrix A. 
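 *
 * As a rough illustration only (the actual bounds and step sizes come
 * from subdims, not the literal values shown here), the kernel emitted
 * below has the shape:
 *
 *   for (m0 = start; m0 != end; m0 advances by one block)   // panels along M
 *       for (k0 over the rectangular part; k0 += bwidth)
 *           read a block of B, then blkmul() into the private tile;
 *       update the tile, heap it to LDS, multiply it by the inverted
 *       diagonal block and write the result back to B.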
*/ // top level loop over M if (isU) { sprintf(tmp1, "(((finishRow - 1) / %lu) * %lu)", subdims[0].y, subdims[0].y); //last block start sprintf(tmp, "for (m0 = %s; m0 + %lu != startRow; m0 -= %lu)", tmp1, subdims[0].y, subdims[0].y); ret = kgenBeginBranch(ctx, tmp); } else { sprintf(tmp, "for (m0 = startRow; m0 < finishRow; m0 += %lu)", subdims[0].y); ret = kgenBeginBranch(ctx, tmp); } sprintf(tmp, "coordA.%c = m0 + lid / %u * %lu;\n" "coordA.%c = 0;\n\n", vect[tra], l1Pans, subdims[1].y, vect[1 - tra]); kgenAddStmt(ctx, tmp); genZeroResult(ctx, dtype, subdims); // loop over K if (isU) { sprintf(tmp, "for (k0 = m0 + %lu; k0 < M; k0 += %lu)", subdims[0].bwidth, subdims[0].bwidth); } else { sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)", subdims[0].bwidth); } ret = kgenBeginBranch(ctx, tmp); genPrepareRectBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, trb, 'C', !areTails); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); // multiplication in the adjusting loop genMultiplication(ctx, subdims, dtype, blkmul, mulFlags); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenEndBranch(ctx, NULL); // loop over K kgenAddBlankLine(ctx); if (mulFlags & BLKMUL_SKEW_ROW) { kgenAddStmt(ctx, "reorderResult(c, skew);\n"); } kgenAddStmt(ctx, "k0 = m0;\n"); genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev, updateResGenericFnRev, true); genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC"); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); genZeroResult(ctx, dtype, subdims); // multiplication on the inverted block genMultiplication(ctx, subdims, dtype, blkmul, mulFlags); if (mulFlags & BLKMUL_SKEW_ROW) { kgenAddStmt(ctx, "reorderResult(c, skew);\n"); } // write back the tile evaluated upFlags = UPRES_EXCEED_PROBLEM_CONDITION; if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_C)) { upFlags |= UPRES_COLUMN_MAJOR; } genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn, updateResGenericFn, NULL); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // end external loops over panels of matrix A kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static ssize_t wrapper( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; if (kextra->kernType == CLBLAS_COMPUTING_KERNEL) { return generator(buf, buflen, subdims, pgran, extra); } else { return prepGenerator(buf, buflen, subdims, pgran, extra); } } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; (void)extra; if (blasArgs->kernType == CLBLAS_COMPUTING_KERNEL) { if (blasArgs->side == clblasLeft) { initSizeKarg(&args[0], blasArgs->K); initSizeKarg(&args[1], blasArgs->N); } else { initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->K); } assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0); initMemobjKarg(&args[4], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[5], blasArgs->ldb.matrix); if (blasArgs->side == clblasLeft) { initSizeKarg(&args[6], blasArgs->offsetM); initSizeKarg(&args[7], blasArgs->M + blasArgs->offsetM); } else { initSizeKarg(&args[6], blasArgs->offsetN); initSizeKarg(&args[7], blasArgs->N + blasArgs->offsetN); } initSizeKarg(&args[8], blasArgs->offBX); } else { if (blasArgs->side == clblasLeft) { initSizeKarg(&args[0], blasArgs->M); } else { initSizeKarg(&args[0], blasArgs->N); } initMemobjKarg(&args[1], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[2], blasArgs->lda.matrix); initMemobjKarg(&args[3], blasArgs->scimage[0], NULL, 0, 0); if (blasArgs->side == clblasLeft) { initSizeKarg(&args[4], blasArgs->offsetM); } else { initSizeKarg(&args[4], blasArgs->offsetN); } initSizeKarg(&args[5], blasArgs->offA); } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong sizeA, sizeB, size; const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; /* * For prepare kernel two square local blocks required. * For main kernel two rectangular blocks required. * Maximum of these two values checked. */ sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side); sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side); size = (sizeA > sizeB) ? 
sizeA : sizeB; return (2 * size * dtypeSize(dtype) <= ldsSize); } static void calcNrThreads( size_t threads[2], const SubproblemDim *dims, const PGranularity *pgran, const void *args, const void *extra) { SubproblemDim globDim, offDim; const CLBlasKargs *kargs = (const CLBlasKargs*)args; size_t width, startBlock, finishBlock; bool isU = (kargs->uplo == clblasUpper) ^ (kargs->transA != clblasNoTrans) ^ (kargs->side == clblasRight); (void)extra; width = kargs->K; width = (width + dims[0].bwidth - 1) / dims[0].bwidth; kargsToProbDims(&globDim, CLBLAS_TRSM, kargs, false); kargsToProbDims(&offDim, CLBLAS_TRSM, kargs, true); startBlock = offDim.y / dims[0].bwidth; finishBlock = (globDim.y + offDim.y + dims[0].bwidth - 1) / dims[0].bwidth; if (kargs->kernType == CLBLAS_PREP_A_KERNEL) { if (isU) { threads[0] = ((2 * width - startBlock - finishBlock + 1) * (finishBlock - startBlock) / 2) * pgran->wgSize[0]; } else { threads[0] = ((1 + finishBlock + startBlock) * (finishBlock - startBlock) / 2) * pgran->wgSize[0]; } threads[1] = 0; } else { calcGlobalThreads(threads, dims, pgran, globDim.y, globDim.x); } } static void imgPackMode( const void *extra, const SubproblemDim *dims, int dataID, unsigned int *packRate, clblasOrder *packOrder) { bool trb; const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; (void)dataID; trb = isMatrixAccessColMaj(CLBLAS_TRSM, kextra->flags, MATRIX_B); if (trb || isComplexType(kextra->dtype)) { *packOrder = clblasRowMajor; *packRate = (unsigned int)dims[0].y; } else { *packOrder = clblasColumnMajor; *packRate = (unsigned int)dims[0].y; } } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } void initTrsmImgPattern(MemoryPattern *mempat) { mempat->name = "Image based block trsm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &solverOps; mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_IMAGE; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } clblas-2.10/src/library/blas/gens/legacy/trsm_kgen_legacy.c000066400000000000000000000131251264277366700237570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "../blas_kgen.h" #include "trsm_kgen_legacy.h" void genUpdateIntermTrsmResult( struct KgenContext *ctx, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName, bool withMhitCond) { char tmp[1024]; const char *coordY, *coordX; char *revAlp, *alp; DataType dtype = gset->kextra->dtype; KernelExtraFlags kflags = gset->kextra->flags; const SubproblemDim *dim = &gset->subdims[1]; const KernelVarNames *kvarNames = &gset->varNames; if (isComplexType(dtype)) { if (dtype == TYPE_COMPLEX_FLOAT) { revAlp = "div((float2)(-1.f, 0), alpha)"; alp = "(float2)(1.f, 0)"; } else { revAlp = "div((double2)(-1., 0), alpha)"; alp = "(double2)(1., 0)"; } } else { revAlp = "-1. / alpha"; alp = "1."; } coordY = kvarNames->coordA; coordX = kvarNames->coordB; if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) { sprintf(tmp, "%s(B, c, %s, %s, %s, ldb, %s);\n", optFuncName, alp, coordY, coordX, revAlp); kgenAddStmt(ctx, tmp); } else { if (withMhitCond) { sprintf(tmp, "if ((%s < %s) && (%s < %s))", coordY, kvarNames->sizeM, coordX, kvarNames->sizeN); kgenBeginBranch(ctx, tmp); } else { /* for x, y variables scope */ kgenBeginBranch(ctx, NULL); } sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n" "if ((y == %luu) && (x == %luu)) {\n" " %s(B, c, %s, %s, %s, ldb, %s);\n" "}\n" "else {\n" " %s(B, c, %s, %s, %s, ldb, %s, y, x);\n" "}\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX, dim->y, dim->x, optFuncName, alp, coordY, coordX, revAlp, genericFuncName, alp, coordY, coordX, revAlp); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } } void genHeapTrsmResultToLDS( struct KgenContext *ctx, const BlasGenSettings *gset, const char *funcName, const char *dstName) { char tmp[1024]; char *alp; unsigned int l1Pans; DataType dtype = gset->kextra->dtype; const SubproblemDim *dims = gset->subdims; if(isComplexType(dtype)) { if (dtype == TYPE_COMPLEX_FLOAT) { alp = "(float2)(1.f, 0)"; } else { alp = "(double2)(1., 0)"; } } else { alp = "1."; } l1Pans = (unsigned int)dims[0].x / (unsigned int)dims[1].x; sprintf(tmp, "%s(%s, c, %s, (lid / %u * %lu), (lid %% %u * %lu), %lu);\n", funcName, dstName, alp, l1Pans, dims[1].y, l1Pans, dims[1].x, dims[0].bwidth); kgenAddStmt(ctx, tmp); } void genInvertingBlockFunc( struct KgenContext *ctx, size_t pitch, DataType dtype, KernelExtraFlags kflags) { char tmp[1024]; const char *ctype; ctype = dtypeBuiltinType(dtype); sprintf(tmp, "void\ninvert(__local %s *src, __local %s *dst, int lid, " "int lastRow)\n", ctype, ctype); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "int i, k;\n"); if (isComplexType(dtype)) { sprintf(tmp, "dst[lid * %lu + lid].x = 1.f;\n", pitch); } else { sprintf(tmp, "dst[lid * %lu + lid] = 1.f;\n", pitch); } kgenAddStmt(ctx, tmp); if (isMatrixUpper(kflags)) { sprintf(tmp, "for (i = lastRow - 1; i >= 0; i--)"); } else { sprintf(tmp, "for (i = 0; i < lastRow; i++)"); } kgenBeginBranch(ctx, tmp); if (isComplexType(dtype)) { sprintf(tmp, "dst[i * %lu + lid] = div(dst[i * %lu + lid], " "src[i * %lu + i]);\n", pitch, pitch, pitch); } else { sprintf(tmp, "dst[i * %lu + lid] = dst[i * %lu + lid] / " "src[i * %lu + i];\n", pitch, pitch, pitch); } kgenAddStmt(ctx, tmp); if (isMatrixUpper(kflags)) { sprintf(tmp, "for (k = 0; k < i; k++)"); } else { sprintf(tmp, "for (k = i + 1; k < %lu; k++)", pitch); } kgenBeginBranch(ctx, tmp); if (isComplexType(dtype)) { sprintf(tmp, "dst[k * %lu + 
lid] = dst[k * %lu + lid] - " "mul(src[k * %lu + i], dst[i * %lu + lid]);\n", pitch, pitch, pitch, pitch); } else { sprintf(tmp, "dst[k * %lu + lid] = dst[k * %lu + lid] - " "dst[i * %lu + lid] * src[k * %lu + i];\n", pitch, pitch, pitch, pitch); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); } clblas-2.10/src/library/blas/gens/legacy/trsm_kgen_legacy.h000066400000000000000000000024211264277366700237610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TRSM_KGEN_LEGACY_H_ #define TRSM_KGEN_LEGACY_H_ void genUpdateIntermTrsmResult( struct KgenContext *ctx, const BlasGenSettings *gset, const char *optFuncName, const char *genericFuncName, bool withMhitCond); void genHeapTrsmResultToLDS( struct KgenContext *ctx, const BlasGenSettings *gset, const char *funcName, const char *dstName); void genInvertingBlockFunc( struct KgenContext *ctx, size_t pitch, DataType dtype, KernelExtraFlags kflags); #endif /* TRSM_KGEN_LEGACY_H_ */ clblas-2.10/src/library/blas/gens/legacy/trsm_lds.c000066400000000000000000000457101264277366700222760ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * LDS based trsm generator */ #include #include #include #include #include #include #include #include #include "../init.h" #include "blas_kgen_legacy.h" #include "gen_helper_legacy.h" #include "trsm_kgen_legacy.h" #include "../trxm_common.h" #include "../trsm_kgen.h" static CLBLASMpatExtra mpatExtra; /* * template for memory object based trsm preparation part * for one dimensional work space */ static const char *trsmPrep1D = "uint m0, k0;\n" "__local %s tempA[%lu];\n" "__local %s tempC[%lu];\n" "%s c[%u];\n" "int lid, gid;\n" "%s" // groups per Panel variable "uint currM, currN;\n" "uint x, y;\n" "uint2 coordA, coordB;\n" "\n" "lid = get_local_id(0);\n" "gid = get_global_id(0) / %u;\n" // group ID "\n"; static const char *readSquareBlock = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? 
%lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // just read with an optimized function " %s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s((LPtr)temp%c, (GPtr)A, currM, k0, y, x, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockOpt = // just read with an optimized function "%s((LPtr)temp%c, (GPtr)A, currM, k0, lda);\n"; static const char *readSquareBlockTrans = "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // read and transpose with an optimized function " %s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // read and transpose with slow function " %s((LPtr)temp%c, (GPtr)A, k0, currM, x, y, %lu, lda);\n" "}\n\n"; static const char *readSquareBlockTransOpt = // read and transpose with an optimized function "%s((LPtr)temp%c, (GPtr)A, k0, currM, lda);\n"; static const char *readRectBlock = "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // just read with an optimized function " %s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" " %s((LPtr)temp%c, (GPtr)B, currN, k0, y, x, %lu, ldb);\n" "}\n\n"; static const char *readRectBlockOpt = // just read with an optimized function "%s((LPtr)temp%c, (GPtr)B, currN, k0, ldb);\n"; static const char *readRectBlockTrans = "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= M) ? %lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // read and transpose with an optimized function " %s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n" "}\n" "else {\n" " %s((__local float4*)temp%c);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // read and transpose with slow function " %s((LPtr)temp%c, (GPtr)B, k0, currN, x, y, %lu, ldb);\n" "}\n\n"; static const char *readRectBlockTransOpt = // read and transpose with an optimized function "%s((LPtr)temp%c, (GPtr)B, k0, currN, ldb);\n"; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static SolverOps solverOps = { generator, assignKargs, isFitToLDS, NULL, NULL, NULL, NULL, solverFlags, NULL, //fixupArgs NULL, //getDefaultDecomp NULL, //getDecompList NULL, NULL }; static void genZeroResult( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims) { unsigned int n; char tmp[1024]; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); getResultGPRsInfo(dtype, &dims[1], vecLen, &n, NULL); sprintf(tmp, "for (x = 0; x < %u; x++) {\n" " c[x] = 0;\n" "}\n\n", n); kgenAddStmt(ctx, tmp); } static void genPrepareSquareBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags kflags, char c) { char tmp[1024]; size_t pitch; const char *readBlock; bool tra; tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); if (!(kflags & KEXTRA_TAILS_M)) { readBlock = (tra) ? 
readSquareBlockTransOpt : readSquareBlockOpt; sprintf(tmp, readBlock, copyFuncs->read[MATRIX_A], c); } else { readBlock = (tra) ? readSquareBlockTrans : readSquareBlock; sprintf(tmp, readBlock, dim->y, dim->y, dim->bwidth, dim->bwidth, dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], c, zeroFuncs->names[MATRIX_A], c, copyFuncs->readGeneric[MATRIX_A], c, pitch); } kgenAddStmt(ctx, tmp); } static void genPrepareRectBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags kflags, char c) { char tmp[1024]; size_t pitch; const char *readBlock; bool trb; trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft); if (!(kflags & (KEXTRA_TAILS_N | KEXTRA_TAILS_M))) { readBlock = (trb) ? readRectBlockTransOpt : readRectBlockOpt; sprintf(tmp, readBlock, copyFuncs->read[MATRIX_B], c); } else { readBlock = (trb) ? readRectBlockTrans : readRectBlock; sprintf(tmp, readBlock, dim->x, dim->x, dim->bwidth, dim->bwidth, dim->x, dim->bwidth, copyFuncs->read[MATRIX_B], c, zeroFuncs->names[MATRIX_B], c, copyFuncs->readGeneric[MATRIX_B], c, pitch); } kgenAddStmt(ctx, tmp); } static void genZeroBlockA( struct KgenContext *ctx, const ZeroFuncs *zeroFuncs) { char tmp[1024]; sprintf(tmp, "%s((__local float4*)tempA);\n", zeroFuncs->names[MATRIX_A]); kgenAddStmt(ctx, tmp); } /* * Generate control block of the loop over K * Two kind of loops: without triangle block and only triangle block */ static void genInternalLoopCtl( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags, bool triangPart) { char tmp[1024]; (void)triangPart; if (isMatrixUpper(kflags)) { sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)", dim->bwidth, dim->bwidth); } else { sprintf(tmp, "for (k0 = 0; k0 < currM; k0 += %lu)", dim->bwidth); } kgenBeginBranch(ctx, tmp); } static void genInitCurrM( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { /* start from the last block */ sprintf(tmp, "currM = ((M - 1) / %lu) * %lu;\n", dim->y, dim->y); kgenAddStmt(ctx, tmp); } else { kgenAddStmt(ctx, "currM = 0;\n"); } } static void initKernelVarNames(KernelVarNames *kvars, KernelExtraFlags kflags) { kvars->A = "A"; kvars->B = "B"; if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) { kvars->coordA = "coordA.x"; } else { kvars->coordA = "coordA.y"; } if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { kvars->coordB = "coordB.x"; } else { kvars->coordB = "coordB.y"; } kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "origM"; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; char tmp[1024]; char blkmul[FUNC_NAME_MAXLEN]; char updateResFn[FUNC_NAME_MAXLEN]; char updateResGenericFn[FUNC_NAME_MAXLEN]; char updateResFnRev[FUNC_NAME_MAXLEN]; char updateResGenericFnRev[FUNC_NAME_MAXLEN]; char copyPLFn[FUNC_NAME_MAXLEN]; char *s1 = ""; const char *typeName; CopyBufFuncs copyFuncs; ZeroFuncs zeroFuncs; DataType dtype = kextra->dtype; ssize_t ret; BlasGenSettings gset; BlkMulOpts mulOpts; size_t pitchAB, pitchC; bool b; const char *outTypeName; unsigned int nrRegs; unsigned int vecLen = sizeof(cl_float4) / dtypeSize(dtype); int tra, trb; unsigned int l1Pans; char vect[2] = {'y', 'x'}; UpdateResultFlags 
upFlags; if (pgran->wgDim != 1) { return -EINVAL; } ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations and auxiliary functions b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.kextra = kextra; gset.pgran = pgran; initKernelVarNames(&gset.varNames, kflags); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } generateBufCopyFuncs(©Funcs, ctx, CLBLAS_TRSM, &gset, BCHF_MATRIX_A | BCHF_MATRIX_B | BCHF_WRITE_OUTPUT); generateZeroingFuncs(&zeroFuncs, ctx, &subdims[0], pgran, dtype, ZF_MATRIX_A | ZF_MATRIX_B); getResultGPRsInfo(dtype, &subdims[1], vecLen, &nrRegs, &outTypeName); // functions updating result // for the final result generateUpresFuncs(ctx, CLBLAS_TRSM, &gset, updateResFn, updateResGenericFn); // for intermediate result after blocks modification upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= UPRES_WITH_BETA | UPRES_PRIV_DEST; genUpresFuncsWithFlags(ctx, &gset, upFlags, updateResFnRev, updateResGenericFnRev); // for heaping before multiplying on inverted block updateResultGenOld(ctx, &gset, UPRES_SET, UPRES_COLUMN_MAJOR | UPRES_USE_LDS, NULL); kgenGetLastFuncName(copyPLFn, FUNC_NAME_MAXLEN, ctx); kgenAddBlankLine(ctx); // block multiplication function mulOpts.aMobj = CLMEM_BUFFER; mulOpts.bMobj = CLMEM_BUFFER; mulOpts.flags = BLKMUL_SKEW_COLUMN | BLKMUL_OUTPUT_PRIVATE; mulOpts.core = BLKMUL_SEPARATE_MULADD; ret = blkMulGen(ctx, subdims, dtype, &mulOpts); if (ret) { destroyKgenContext(ctx); return -EOVERFLOW; } kgenAddBlankLine(ctx); kgenGetLastFuncName(blkmul, sizeof(blkmul), ctx); //matrix inversion function genInvertingBlockFunc(ctx, subdims[0].bwidth, dtype, kflags); typeName = dtypeBuiltinType(dtype); // now, generate the kernel declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, NULL, false, false); ret = kgenBeginFuncBody(ctx); /* * Calculate local buffer pitches, and then insert the * preparative code */ pitchAB = matrBlockPitch(subdims, MATRIX_A, dtype, clblasLeft); pitchC = matrBlockPitch(subdims, MATRIX_C, dtype, clblasLeft); sprintf(tmp, trsmPrep1D, typeName, pitchAB * subdims[0].y, typeName, ((pitchC > pitchAB) ? pitchC : pitchAB) * subdims[0].y, outTypeName, nrRegs, s1, pgran->wgSize[0]); ret = kgenAddStmt(ctx, tmp); /* * B matrix is divided on panels, each work group * multiply such a panel on the whole matrix A. 
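 *
 * Sketch of the emitted structure (sizes are placeholders, the real
 * values come from subdims):
 *
 *   currN = gid * dx;                 // panel of B owned by this work group
 *   for (m0 = 0; m0 < M; m0 += dy) {
 *       loop over k0: read a square block of A and a rectangular block
 *           of B into LDS and accumulate with the block multiplier;
 *       invert the diagonal block of A in LDS, apply it to the
 *       accumulated tile and write the updated tile back to B;
 *   }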
*/ sprintf(tmp, "currN = gid * %lu;\n", subdims[0].x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, false); tra = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); trb = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; sprintf(tmp, "coordB.%c = currN + lid %% %u * %lu;\n" "coordB.%c = 0;\n\n", vect[trb], l1Pans, subdims[1].x, vect[1 - trb]); kgenAddStmt(ctx, tmp); // loop over M sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims->y); kgenBeginBranch(ctx, tmp); sprintf(tmp, "coordA.%c = currM + lid / %u * %lu;\n" "coordA.%c = 0;\n\n", vect[tra], l1Pans, subdims[1].y, vect[1 - tra]); kgenAddStmt(ctx, tmp); genZeroResult(ctx, dtype, subdims); genInternalLoopCtl(ctx, subdims, kflags, false); // loop over K genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags, 'A'); genPrepareRectBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags, 'C'); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); // multiplication for the step-by-step block updating sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n" " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n" " (%s*)c, lid %% %lu);\n", blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y); ret = kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); genInternalLoopEnd(ctx); // loop over K kgenAddBlankLine(ctx); kgenAddStmt(ctx, "k0 = currM;\n"); genPrepareSquareBlock(ctx, subdims, dtype, ©Funcs, &zeroFuncs, kflags, 'C'); genZeroBlockA(ctx, &zeroFuncs); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (kflags & KEXTRA_UNIT_DIAGONAL) { sprintf(tmp, "if (lid < %lu) {\n" " tempC[lid * %lu + lid] = %s;\n" "}\n", subdims[0].bwidth, pitchAB, strOne(dtype)); kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); } sprintf(tmp, "if (lid < %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, tmp); sprintf(tmp, "invert(tempC, tempA, lid, (currM + %lu > M) ? " "M - currM : %lu);\n", subdims[0].y, subdims[0].y); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); genUpdateIntermTrsmResult(ctx, &gset, updateResFnRev, updateResGenericFnRev, true); genHeapTrsmResultToLDS(ctx, &gset, copyPLFn, "tempC"); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); genZeroResult(ctx, dtype, subdims); // multypling on an inverted block sprintf(tmp, "%s((LPtr)(tempA + (lid / %u * %lu) * %lu), \n" " (LPtr)(tempC + (lid %% %u * %lu) * %lu),\n" " (%s*)c, lid %% %lu);\n\n", blkmul, l1Pans, subdims[1].y, pitchAB, l1Pans, subdims[1].x, pitchAB, outTypeName, subdims[1].y); ret = kgenAddStmt(ctx, tmp); // write back the tile evaluated upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= UPRES_EXCEED_PROBLEM_CONDITION; genResultUpdateWithFlagsOld(ctx, CLBLAS_TRSM, &gset, upFlags, updateResFn, updateResGenericFn, NULL); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); if (isMatrixUpper(kflags)) { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); // loop over M kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags; int idx = 7; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[6], blasArgs->ldb.matrix); if (kflags & KEXTRA_STARTM_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offsetM); } if (kflags & KEXTRA_STARTN_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offsetN); } if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offBX); } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong sizeA, sizeB, size; const CLBlasKargs *kargs = (const CLBlasKargs*)kernelArgs; /* * It's needed one block for each matrix A and B, * and one block of size maximal of this one for * matrix B and matrix C */ sizeA = matrBlockSize(dim, MATRIX_A, dtype, kargs->side); sizeB = matrBlockSize(dim, MATRIX_B, dtype, kargs->side); size = matrBlockSize(dim, MATRIX_C, dtype, kargs->side); if (sizeB > size) { size = sizeB; } size += sizeA + sizeB; return (size * dtypeSize(dtype) <= ldsSize); } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } void initTrsmLdsPattern(MemoryPattern *mempat) { mempat->name = "LDS based block trsm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &solverOps; mpatExtra.aMset = CLMEM_LEVEL_LDS; mpatExtra.bMset = CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } clblas-2.10/src/library/blas/gens/legacy/trxm_common_legacy.c000066400000000000000000000204231264277366700243270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "trxm_common_legacy.h" void declareLdsBasedTrxmVariables( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims, const PGranularity *pgran, bool useLocalC) { char tmp[1024]; size_t pitchAB, pitchC; const char *inTypeName, *outTypeName; unsigned int nrRegs; unsigned int vecLen; inTypeName = dtypeBuiltinType(dtype); pitchAB = matrBlockPitch(dims, MATRIX_A, dtype, clblasLeft); pitchC = matrBlockPitch(dims, MATRIX_C, dtype, clblasLeft); vecLen = sizeof(cl_float4) / dtypeSize(dtype); sprintf(tmp, "__local %s tempA[%lu];\n" "__local %s tempB[%lu];\n" "uint m0, k0;\n" "uint currM, currN;\n" "uint2 coordA, coordB;\n" "uint x, y;\n", inTypeName, pitchAB * dims->y, inTypeName, pitchAB * dims->x); kgenAddStmt(ctx, tmp); getResultGPRsInfo(dtype, &dims[1], vecLen, &nrRegs, &outTypeName); if (useLocalC) { sprintf(tmp, "__local %s tempC[%lu];\n", inTypeName, pitchC * dims->y); } else { sprintf(tmp, "%s c[%u];\n", outTypeName, nrRegs); } kgenAddStmt(ctx, tmp); kgenDeclareLocalID(ctx, "lid", pgran); kgenDeclareGroupID(ctx, "gid", pgran); kgenAddBlankLine(ctx); } void genPrepareTrxmBlockA( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags, const char *nameM) { char tmp[1024]; size_t pitch; const char *coordName[2] = {"currM", "k0"}; const char *sizeName[2] = {"y", "x"}; int tra; pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); tra = isMatrixAccessColMaj(CLBLAS_TRMM, flags, MATRIX_A); /* * If the (sub)problem is integrally divisible, * skip any checks, and just read with optimal blocks, * otherwise check for tails and then read with a * fast function in the case of optimal blocks, and with * the slow one in the case of tails respectively */ if (!(flags & KEXTRA_TAILS_M)) { sprintf(tmp, "%s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n", copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra]); } else { sprintf(tmp, "y = (currM + %lu <= M) ? %lu : M - currM;\n" "x = (k0 + %lu <= %s) ? %lu : %s - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // fast read " %s((LPtr)tempA, (GPtr)A, %s, %s, lda);\n" "}\n" "else {\n" " %s((__local float4*)tempA);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // slow read " %s((LPtr)tempA, (GPtr)A, %s, %s, %s, %s, %lu, lda);\n" "}\n\n", dim->y, dim->y, dim->bwidth, nameM, dim->bwidth, nameM, dim->y, dim->bwidth, copyFuncs->read[MATRIX_A], coordName[tra], coordName[1 - tra], zeroFuncs->names[MATRIX_A], copyFuncs->readGeneric[MATRIX_A], coordName[tra], coordName[1 - tra], sizeName[tra], sizeName[1 - tra], pitch); } kgenAddStmt(ctx, tmp); } void genPrepareTrxmBlockB( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags) { char tmp[1024]; size_t pitch; const char *coordName[2] = {"currN", "k0"}; const char *sizeName[2] = {"y", "x"}; int trb; trb = isMatrixAccessColMaj(CLBLAS_TRMM, flags, MATRIX_B); pitch = matrBlockPitch(dim, MATRIX_B, dtype, clblasLeft); if (!(flags & (KEXTRA_TAILS_N | KEXTRA_TAILS_K))) { sprintf(tmp, "%s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n", copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb]); } else { sprintf(tmp, "y = (currN + %lu <= N) ? %lu : N - currN;\n" "x = (k0 + %lu <= M) ? 
%lu : M - k0;\n" "if ((y == %lu) && (x == %lu)) {\n" // fast read " %s((LPtr)tempB, (GPtr)B, %s, %s, ldb);\n" "}\n" "else {\n" " %s((__local float4*)tempB);\n" // zeroing " barrier(CLK_LOCAL_MEM_FENCE);\n" // barrier if it's needed // slow read " %s((LPtr)tempB, (GPtr)B, %s, %s, %s, %s, %lu, ldb);\n" "}\n\n", dim->x, dim->x, dim->bwidth, dim->bwidth, dim->x, dim->bwidth, copyFuncs->read[MATRIX_B], coordName[trb], coordName[1 - trb], zeroFuncs->names[MATRIX_B], copyFuncs->readGeneric[MATRIX_B], coordName[trb], coordName[1 - trb], sizeName[trb], sizeName[1 - trb], pitch); } kgenAddStmt(ctx, tmp); } void genTriangMatrBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, KernelExtraFlags kflags) { char tmp[1024], tmp1[512]; const char *one; size_t pitch; pitch = matrBlockPitch(dim, MATRIX_A, dtype, clblasLeft); one = strOne(dtype); strcpy(tmp1, ""); // staring diagonal coordinates kgenAddStmt(ctx, "y = (k0 < currM) ? 0 : (k0 - currM);\n" "x = (k0 < currM) ? (currM - k0) : 0;\n\n"); if (isMatrixUpper(kflags)) { /* * resulting block is upper diagonal, zeroing everything * below the diagonal and set "1" on the diagonal for the * unit diagonal matrix */ if (kflags & KEXTRA_UNIT_DIAGONAL) { sprintf(tmp1, "\n" " if (x < %lu) {\n" " tempA[lid * %lu + x] = %s;\n" " }\n", dim->bwidth, pitch, one); } sprintf(tmp, "if (lid >= y && lid < %lu) {\n" " uint i;\n" "\n" " x = x + lid - y;\n" " x = (x > %lu) ? %lu : x;\n" "\n" " for (i = 0; i < x; i++) {\n" " tempA[lid * %lu + i] = 0;\n" " }\n" "%s" "}\n", dim->y, dim->bwidth, dim->bwidth, pitch, tmp1); } else { /* * resulting block is lower diagonal, zeroing everything * above the diagonal and set "1" on the diagonal for the * unit diagonal matrix */ if (kflags & KEXTRA_UNIT_DIAGONAL) { sprintf(tmp1, "\n" " if (y < %lu) {\n" " tempA[y * %lu + lid] = %s;\n" " }\n", dim->y, pitch, one); } sprintf(tmp, "if (lid >= x && lid < %lu) {\n" " uint i;\n" "\n" " y = y + lid - x;\n" " y = (y > %lu) ? %lu : y;\n" "\n" " for (i = 0; i < y; i++) {\n" " tempA[i * %lu + lid] = 0;\n" " }\n" "%s" "}\n", dim->bwidth, dim->y, dim->y, pitch, tmp1); } kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); } clblas-2.10/src/library/blas/gens/legacy/trxm_common_legacy.h000066400000000000000000000051141264277366700243340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TRXM_COMMON_LEGACY_H_ #define TRXM_COMMON_LEGACY_H_ #include "../gen_helper.h" /* * Declare local variables for LDS based version * of TRXM kernels. * * It provides the names typical for another generators as well: * * lid, gid - local and global ID. 
* m0, k0 - top level counters over M and N * currM, currN - current block coordinates over M and N at the top level * tempA, tempB - blocks of matrix A and B located in the local memory * tempC - block of matrix C located in the local memory; declared if * the 'useLocalC' argument is set * c - matrix C tile located in registers; declared if the 'useLocalC' * argument is not set * x, y - auxiliary variables to evaluate size of read/write blocks * * TRXM specific variables: * * startM, endM - starting and end coordinate over rows a kernel can access */ void declareLdsBasedTrxmVariables( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims, const PGranularity *pgran, bool useLocalC); /* * NOTE: the all following functions generate a code * using local variables declared with the * 'declareTrxmLocalVariables' function */ void genPrepareTrxmBlockA( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags, const char *nameM); void genPrepareTrxmBlockB( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags); /* * Triangulate matrix block. The decision to triangulate is * made based on the current coordinates. */ void genTriangMatrBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, KernelExtraFlags kflags); #endif /* TRXM_COMMON_LEGACY_H_ */ clblas-2.10/src/library/blas/gens/nrm2.cpp000066400000000000000000000170031264277366700204130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * nrm2 generator */ //#define DEBUG_NRM2 #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? 
(a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_NRM2 printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initNrm2RegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps nrm2Ops = { generator, assignKargs, NULL, NULL, NULL, calcNrThreads, NULL, solverFlags, fixupArgs, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(((kargs->offBX) % vlen) != 0) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX"); } if(kargs->redctnType == REDUCE_BY_HYPOT) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_HYPOT"); } else if(kargs->redctnType == REDUCE_BY_SSQ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_SSQ"); } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldb.vector) < 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initNrm2RegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_NRM2 printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based Nrm2"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &nrm2Ops; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N + (blockSize*vecLen) - 1)/ (blockSize*vecLen)); wgToSpawn = min( wgToSpawn, (numComputeUnits * 
WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; SolutionStep *step = container_of(subdims, subdims, SolutionStep); if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; unsigned int vecLenA = extraFlags->vecLenA; bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; } const char *kernName; if(step->args.redctnType == REDUCE_BY_HYPOT) { kernName = nrm2_hypot_kernel; } else if (step->args.redctnType == REDUCE_BY_SSQ) { kernName = nrm2_ssq_kernel; } else { printf(" Error in selecting kernel!\n"); return 0; } strcpy( tempTemplate, kernName ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXnrm2_kernel( __global %TYPE *_X, __global %TYPE *_Y, __global %TYPE *scratchBuff, uint N, uint offx, int incx, uint offy, int incy, int doConj ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx; INIT_KARG(&args[0], blasArgs->B); INIT_KARG(&args[1], blasArgs->D); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); return; } /** The purpose of this function is to add an work-group size indicator in kernelKey, so that a different kernel is generated when work-group size is changed. Reduction loop is unrolled in kprintf based on work-group size. Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel this will become a kernelKey, and kernel cache will be accordingly managed. Note -- SubproblemDim is a member of kernelKey **/ static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { DUMMY_ARG_USAGE(extra); CLBlasKargs *kargs = (CLBlasKargs*)args; SolutionStep *step = container_of(kargs, args, SolutionStep); subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]); } clblas-2.10/src/library/blas/gens/reduction.cpp000066400000000000000000000213261264277366700215340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * reduction generator */ //#define DEBUG_REDUCTION #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_REDUCTION printf("solverFlags called...\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initReductionRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps reductionOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, fixupArgs, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } // Since ssq will be vector-loaded from Nth location of scratch buffer i.e scratchBuff[N] // If N is not a multiple of vlen, then use vload if( (kargs->redctnType == REDUCE_BY_SSQ) && (((kargs->N) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } switch(kargs->redctnType) { case REDUCE_BY_SUM: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SUM"); break; case REDUCE_BY_MAX: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MAX"); break; case REDUCE_BY_MIN: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MIN"); break; case REDUCE_MAX_WITH_INDEX: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX"); break; case REDUCE_BY_HYPOT: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_HYPOT"); break; case REDUCE_BY_SSQ: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SSQ"); break; case REDUCE_MAX_WITH_INDEX_ATOMICS: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS"); break; default: printf("Invalid reduction type!!\n"); break; } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initReductionRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_REDUCTION printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &reductionOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] 
= 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARGS_USAGE_3(subdims, args, _extra); int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block size_t blocks = 1; // Reduction will use only 1 block #ifdef DEBUG_REDUCTION printf("blocks : %d\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; #ifdef DEBUG_REDUCTION printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARG_USAGE(subdims); size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; SolutionStep *step = container_of( pgran , pgran, SolutionStep); CLBlasKargs* kargs = (CLBlasKargs*) &(step->args); char const *kernName; if(kargs->redctnType == REDUCE_BY_SUM) { kernName = red_sum_kernel; } else if(kargs->redctnType == REDUCE_BY_MAX) { kernName = red_max_kernel; } else if(kargs->redctnType == REDUCE_BY_MIN) { kernName = red_min_kernel; } else if(kargs->redctnType == REDUCE_MAX_WITH_INDEX) { kernName = red_with_index_kernel; } else if(kargs->redctnType == REDUCE_BY_HYPOT) { kernName = red_hypot_kernel; } else if(kargs->redctnType == REDUCE_BY_SSQ) { kernName = red_ssq_kernel; } #ifdef DEBUG_REDUCTION printf("REDUCTION GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); printf("Vector length used : %d\n\n", vecLenA); #endif unsigned int vecLenA = extraFlags->vecLenA; bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; } strcpy( tempTemplate, kernName ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD, BLOCKSIZE); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXred_sum_kernel( __global %TYPE *_X, __global %TYPE *_res, uint N, uint offx, uint offRes ) */ static void assignKargs(KernelArg *args, const void *params, const void* _extra) { DUMMY_ARG_USAGE(_extra); CLBlasKargs *blasArgs = (CLBlasKargs*)params; INIT_KARG(&args[0], blasArgs->D); INIT_KARG(&args[1], blasArgs->A); initSizeKarg(&args[2], blasArgs->N); size_t offScratch = 0; initSizeKarg(&args[3], offScratch); initSizeKarg(&args[4], blasArgs->offA); return; } /** The purpose of this function is to add an work-group size indicator in kernelKey, so that a different kernel is generated when work-group size is changed. Reduction loop is unrolled in kprintf based on work-group size. Member of SubproblemDim- bwidth, will be used to store work-group size of the current kernel this will become a kernelKey, and kernel cache will be accordingly managed. 
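    For example, work-group sizes of, say, 64 and 256 threads produce
    differently unrolled reduction loops, so the two kernels must not be
    looked up under the same cache key.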
Note -- SubproblemDim is a member of kernelKey **/ static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { DUMMY_ARG_USAGE(extra); CLBlasKargs *kargs = (CLBlasKargs*)args; SolutionStep *step = container_of(kargs, args, SolutionStep); subdims->bwidth = (step->pgran.wgSize[0]) * (step->pgran.wgSize[1]); } clblas-2.10/src/library/blas/gens/rotg_reg.cpp000066400000000000000000000121461264277366700213500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * rotg generator */ //#define DEBUG_ROTG #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #include "blas_subgroup.h" #include "gen_helper.h" extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initRotgRegisterPattern(MemoryPattern *mempat); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps rotgOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initRotgRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_ROTG printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &rotgOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0] * 
pgran->wgSize[1]; // 1D Block DUMMY_ARGS_USAGE_3(subdims, _extra, args); size_t blocks = 1; // Only 1 work-group is enough #ifdef DEBUG_ROTG printf("blocks : %d\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; #ifdef DEBUG_ROTG printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; DUMMY_ARGS_USAGE_2(subdims, pgran); char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } #ifdef DEBUG_ROTG printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif strcpy( tempTemplate, (char*)rotg_kernel ); kprintf kobj( Prefix[extraFlags->dtype], 1, false, false); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXrotg_kernel( __global %TYPE *_A, __global %TYPE *_B, __global %PTYPE *_C, __global %TYPE *_S, uint offa, uint offb, uint offc, uint offs ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; INIT_KARG(&args[0], blasArgs->A); INIT_KARG(&args[1], blasArgs->B); INIT_KARG(&args[2], blasArgs->C); INIT_KARG(&args[3], blasArgs->D); initSizeKarg(&args[4], blasArgs->offa); initSizeKarg(&args[5], blasArgs->offb); initSizeKarg(&args[6], blasArgs->offc); initSizeKarg(&args[7], blasArgs->offd); return; } clblas-2.10/src/library/blas/gens/rotm_reg.cpp000066400000000000000000000162361264277366700213620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * rotm generator */ //#define DEBUG_ROTM #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? 
(a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initRotmRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps rotmOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } if(kargs->pigFuncID == CLBLAS_ROT) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_ROT"); } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldc.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initRotmRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_ROTM printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &rotmOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(pgran, subdims); CLBLASKernExtra 
*extraFlags = ( CLBLASKernExtra *)extra; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } #ifdef DEBUG_ROTM printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_ROTM printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_ROTM printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_ROTM printf("Using Aligned Data Pointer .......\n"); #endif } strcpy( tempTemplate, (char*)rotm_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXrotm_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy #ifndef DO_ROT , __global %TYPE *_param, uint offParam // Rotm parameters #else , %PTYPE C, %PTYPE S // Rot parameters #endif */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy; INIT_KARG(&args[0], blasArgs->A); INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); incy = blasArgs->ldc.vector; INIT_KARG(&args[6], incy); if(blasArgs->pigFuncID == CLBLAS_ROT) { DataType alphaBetaType = (blasArgs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT: ((blasArgs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE: blasArgs->dtype); assignScalarKarg(&args[7], &(blasArgs->alpha), alphaBetaType); assignScalarKarg(&args[8], &(blasArgs->beta), alphaBetaType); } else if(blasArgs->pigFuncID == CLBLAS_ROTM) { INIT_KARG(&args[7], blasArgs->D); initSizeKarg(&args[8], blasArgs->offd); } return; } clblas-2.10/src/library/blas/gens/rotmg_reg.cpp000066400000000000000000000121021264277366700215150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * rotmg generator */ //#define DEBUG_ROTMG #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initRotmgRegisterPattern(MemoryPattern *mempat); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps rotmgOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initRotmgRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_ROTMG printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &rotmgOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARGS_USAGE_3(subdims, _extra, args); int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block size_t blocks = 1; // Only 1 work-group is enough #ifdef DEBUG_ROTMG printf("blocks : %d\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; #ifdef DEBUG_ROTMG printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(subdims, pgran); CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } #ifdef DEBUG_ROTMG printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif strcpy( tempTemplate, (char*)rotmg_kernel ); kprintf kobj( Prefix[extraFlags->dtype], 1, false, false); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXrotmg_kernel( __global %TYPE *_D1, __global %TYPE *_D2, __global %TYPE *_X1, __global %TYPE *_Y1, __global %TYPE *_param, uint offD1, uint offD2, uint offX1, uint offY1, uint offParam ) */ static void assignKargs(KernelArg *args, const void *params, const 
void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; INIT_KARG(&args[0], blasArgs->A); INIT_KARG(&args[1], blasArgs->B); INIT_KARG(&args[2], blasArgs->C); INIT_KARG(&args[3], blasArgs->D); INIT_KARG(&args[4], blasArgs->E); initSizeKarg(&args[5], blasArgs->offa); initSizeKarg(&args[6], blasArgs->offb); initSizeKarg(&args[7], blasArgs->offc); initSizeKarg(&args[8], blasArgs->offd); initSizeKarg(&args[9], blasArgs->offe); return; } clblas-2.10/src/library/blas/gens/scal_reg.cpp000066400000000000000000000143131264277366700213150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * SCAL generator */ //#define DEBUG_SCAL #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? (a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_SCAL printf("solverFlags called......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initScalRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps SCALOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0)) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_SCAL printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initScalRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_SCAL printf("initRegPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based SCAL"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &SCALOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(pgran, subdims); char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_SCAL printf("SCAL GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_SCAL printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_SCAL printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_SCAL printf("Using Aligned Data Pointer .........................\n"); #endif } strcpy( tempTemplate, (char*)scal_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXSCAL_kernel( __global %TYPE *_alpha, __global %TYPE *_X, uint N, uint offx, int incx ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx; assignScalarKarg(&args[0], &(blasArgs->alpha), blasArgs->dtype); INIT_KARG(&args[1], blasArgs->A); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); return; } clblas-2.10/src/library/blas/gens/swap_reg.cpp000066400000000000000000000147631264277366700213560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * swap generator */ //#define DEBUG_SWAP #define WORKGROUPS_PER_CU 32 #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include #define min(a, b) (((a) < (b)) ? (a) : (b)) extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_SWAP printf("solverFlags called......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initSwapRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps swapOps = { generator, assignKargs, NULL, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( (((kargs->offBX) % vlen) != 0) || (((kargs->offCY) % vlen) != 0) ) { kflags = KEXTRA_NO_COPY_VEC_A; } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_SWAP printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (kargs->ldb.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY"); } if( (kargs->ldc.vector) != 1) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initSwapRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_SWAP printf("initREgPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based swap"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &swapOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { DUMMY_ARG_USAGE(subdims); const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; CLBlasKargs *kargs = (CLBlasKargs *)args; SolutionStep *step = container_of(kargs, args, SolutionStep); TargetDevice *kDevice = &(step->device); cl_int err; unsigned int numComputeUnits = deviceComputeUnits( (kDevice->id), &err ); if(err != CL_SUCCESS) { numComputeUnits = 1; } unsigned int vecLen = extra->vecLenA; unsigned int blockSize = pgran->wgSize[0] * pgran->wgSize[1]; unsigned int wgToSpawn = ((kargs->N - 1)/ (blockSize*vecLen)) + 1; wgToSpawn = min( wgToSpawn, (numComputeUnits * WORKGROUPS_PER_CU) ); threads[0] = wgToSpawn * blockSize; threads[1] = 1; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { DUMMY_ARGS_USAGE_2(pgran, subdims); char tempTemplate[32*1024]; if ( buf == NULL) // return buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_SWAP printf("SWAP GENERATOR called....\n"); printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_SWAP printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_SWAP printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_SWAP printf("Using Aligned Data Pointer .........................\n"); #endif } strcpy( tempTemplate, (char*)swap_kernel ); kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } /* __kernel void %PREFIXswap_kernel( __global %TYPE *_X, __global %TYPE *_Y, uint N, uint offx, int incx, uint offy, int incy ) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int incx, incy; INIT_KARG(&args[0], blasArgs->A); INIT_KARG(&args[1], blasArgs->B); initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); incx = blasArgs->ldb.vector; INIT_KARG(&args[4], incx); initSizeKarg(&args[5], blasArgs->offCY); incy = blasArgs->ldc.vector; INIT_KARG(&args[6], incy); return; } clblas-2.10/src/library/blas/gens/symm_cached.cpp000066400000000000000000000164351264277366700220210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 
Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Cached global buffers based symm generator */ #include #include #include #include #include #include #include #include #include #include #include //#define DEBUG_SYMM extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static CLBLASMpatExtra mpatExtra; extern "C" unsigned int dtypeSize(DataType type); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); /* static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); */ static SolverFlags solverFlags(void); static void setBuildOpts( char * buildOptStr, const void *kArgs); static SolverOps symmSops = { generator, assignKargs, NULL, //isFitLDS? NULL, //prepareTranslateDims? NULL, //DecomAxis NULL, // calcNrThreads, NULL, //ImagePackMode solverFlags, //SolverFlags NULL, NULL, NULL, setBuildOpts, //Set Build Options NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_TRMV printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if (kargs->side == clblasLeft) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__ "); } else { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__"); } if (kargs->uplo == clblasUpper) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__"); } else { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__"); } if (kargs->order == clblasColumnMajor) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__"); } else { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__"); } strcat(buildOptStr, " -cl-mad-enable "); #ifdef DEBUG_SYMM printf("setBuildOptions: Setting to %s\n", buildOptStr); #endif return; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; char tempTemplate[32*1024]; char itemx[10], itemy[10], width[10], itemy_by_width[10]; size_t Y, X, BLOCKSIZE, ITEMX, ITEMY; if (buf == NULL) { buflen = 32*1024*sizeof(char); return (ssize_t)buflen; } // // Row-major is implemented in terms of column major routines // if ((kflags & KEXTRA_COLUMN_MAJOR) == 0) { return 0; } kprintf kobj(Prefix[dtype], kextra->vecLenA, true, true); BLOCKSIZE = pgran->wgSize[0]; #ifdef DEBUG_SYMM printf("SYMM- generator(): Blocksize passed = %lu, subdimy = %lu, subdimx = %lu, veclen = %lu \n", BLOCKSIZE, subdims->y, subdims->x, kextra->vecLenA); #endif Y = 16; while (Y*(kextra->vecLenA) > subdims->y) { Y /= 2; } X = BLOCKSIZE/Y; ITEMY = (subdims->y) / Y; ITEMX = (subdims->x) / X; if (ITEMX == 0) { ITEMX = 1; } if ((BLOCKSIZE % Y) || ((subdims->y) % Y) || ((subdims->x)%X) || (ITEMY % kextra->vecLenA)) { printf("WARNING: SYMM- generator: subdim and blocksize in-compatible.\n"); } sprintf(width, "%" SPREFIX "u", Y); sprintf(itemy, "%" SPREFIX "u", ITEMY); sprintf(itemx, "%" SPREFIX "u", ITEMX); sprintf(itemy_by_width, "%" SPREFIX "u", (size_t) ITEMY/kextra->vecLenA); kobj.put("%WIDTH", width); kobj.put("%ITEMX", itemx); kobj.put("%ITEMY", itemy); kobj.put("%ITEMY_BY_V", itemy_by_width); #ifdef DEBUG_SYMM printf("ColMajor SYMM - WIDTH = %s, ITEMX = %s, ITEMY = %s\n", width, itemx, itemy); #endif strcpy(tempTemplate, SYMM_C_KERNEL); kobj.spit(buf, tempTemplate); #ifdef DEBUG_SYMM printf("Kernel = \n%s\n", buf); #endif size_t tail = strlen(buf) + 1; while(tail < 32*1024) { buf[tail++] = 0; } return 32*1024*sizeof(char); } /* __kernel void symm_C_kernel( __global %TYPE const * restrict _A, __global %TYPE const * restrict _B, __global %TYPE *C,\n\ uint M, uint N, uint _lda, uint _ldb, int ldc, %TYPE alpha, %TYPE beta) */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; #ifdef DEBUG_SYMM printf("SAlpha=%f, DAlpha=%f, CAlpha =<%f, %f>, DAlpha=<%f, %f>\n", blasArgs->alpha.argFloat, blasArgs->alpha.argDouble, CREAL(blasArgs->alpha.argFloatComplex), CIMAG(blasArgs->alpha.argFloatComplex), CREAL(blasArgs->alpha.argDoubleComplex) , CIMAG(blasArgs->alpha.argDoubleComplex)); printf("SBeta=%f, DBeta=%f, CBeta=<%f, %f>, DBeta=<%f, %f>\n", blasArgs->beta.argFloat, blasArgs->beta.argDouble, CREAL(blasArgs->beta.argFloatComplex), CIMAG(blasArgs->beta.argFloatComplex), CREAL(blasArgs->beta.argDoubleComplex) , CIMAG(blasArgs->beta.argDoubleComplex)); #endif INIT_KARG(&args[0], 
blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); INIT_KARG(&args[2], blasArgs->C); initSizeKarg(&args[3], blasArgs->M); initSizeKarg(&args[4], blasArgs->N); initSizeKarg(&args[5], blasArgs->lda.matrix); initSizeKarg(&args[6], blasArgs->ldb.matrix); initSizeKarg(&args[7], blasArgs->ldc.matrix); initSizeKarg(&args[8], blasArgs->offa); //PENDING: offA or offa ?? initSizeKarg(&args[9], blasArgs->offBX); initSizeKarg(&args[10], blasArgs->offCY); assignScalarKarg(&args[11], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[12], &(blasArgs->beta), blasArgs->dtype); return; } extern "C" void initSymmDefaultPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block Symm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &symmSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; return; } clblas-2.10/src/library/blas/gens/symv.c000066400000000000000000000764021264277366700202030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * symv generator */ #include #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include "xxmv_common.h" static const char *symvDecl = "__attribute__((reqd_work_group_size(%lu, %lu, 1)))\n" "void __kernel\n" "%csymv(\n" " uint N,\n" " const %s alpha,\n" " const __global %s *restrict A,\n" " const __global %s *restrict X,\n" "%s" " __global %s *Y,\n" " uint lda,\n" "%s" // offset A, X and Y "%s" "%s" " const uint startN,\n" " uint actualN)\n"; static CLBLASMpatExtra mpatExtra; struct symvPrivate { TilePostFetchPrivate *pfPriv; TileMulOpts *mulOpts; Tile tilea; bool diag; bool coord; }; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static int symvSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs); static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static SolverOps symvSops = { generator, assignKargs, isFitToLDS, NULL, NULL, calcNrThreads, NULL, solverFlags, fixupArgs, symvSubgGetDefaultDecomp, //getDefaultDecomposition subgCheckCalcDecomp, // get Decomp. List NULL, NULL }; static void declareSymvKernel( struct KgenContext *ctx, DataType dtype, const PGranularity *pgran, KernelExtraFlags kflags) { bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0); bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0); bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0); const char *incxDecl = incxOne ? "" : " const int incx,\n"; const char *incyDecl = incyOne ? 
"" : " const int incy,\n"; char betaDecl[128]; char offDecl[128]; char tmp[512]; char fpref; const char *typeName; typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); if (beta0) { betaDecl[0] = '\0'; } else { sprintf(betaDecl, " const %s beta,\n", typeName); } offDecl[0] = '\0'; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { strcpy(offDecl, " const uint offA,\n"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { strcat(offDecl, " const uint offX,\n"); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { strcat(offDecl, " const uint offY,\n"); } sprintf(tmp, symvDecl, pgran->wgSize[0], pgran->wgSize[1], fpref, typeName, typeName, typeName, betaDecl, typeName, offDecl, incxDecl, incyDecl); kgenDeclareFunction(ctx, tmp); } /* avoid " + 0" statements */ static void genAdd(char *buf, size_t val) { if (val == 0) { buf[0] = 0; //zero length string } else { sprintf(buf, " + %lu", val); } } static int genPostFetchMirror( struct KgenContext *ctx, MatrixRole mrole, void *priv) { TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv; TileMulOpts *mulOpts = ((struct symvPrivate *)priv)->mulOpts; Tile *tileb = (Tile *)&pfPriv->gset->tileA; Tile *tilea = &((struct symvPrivate *)priv)->tilea; bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0); char tmp[1024]; char stmtStr[2][128]; size_t blockx, blocky; unsigned int x, y; const struct SubproblemDim *dims = &pfPriv->gset->subdims[1]; (void)mrole; blockx = blocky = 0; // zero triangular part of tile a // either single row of tile a either the whole tile have been fetched if (tra) { blocky = dims->bwidth; blockx = dims->y; } else { blocky = dims->y; blockx = dims->bwidth; } // loop through block rows for(y = 0; y < blocky; y++) { // loop through all elements of block row for(x = 0; x < blockx; x++) { Kstring kstr[3]; const char *cmp = ">"; sprintfTileElement(&kstr[0], tileb, x, y, 1); sprintfTileElement(&kstr[1], tileb, y, x, 1); sprintfTileElement(&kstr[2], tilea, y, x, 1); genAdd(stmtStr[0], x); genAdd(stmtStr[1], y); sprintf(tmp, "%s = k%s %s n%s ? %s : %s;\n", kstr[2].buf, stmtStr[0], cmp, stmtStr[1], kstr[0].buf, kstr[1].buf); kgenAddStmt(ctx, tmp); } pfPriv->fetchNumA++; } *tileb = *tilea; return 0; } static int genPostFetchDiag( struct KgenContext *ctx, MatrixRole mrole, void *priv) { TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv; Tile *tile = (Tile *)&pfPriv->gset->tileA; bool diag = ((struct symvPrivate *)priv)->diag; bool tra = ((struct symvPrivate *)priv)->coord; char tmp[1024]; char stmtStr[2][128]; const KernelVarNames *vnames = &pfPriv->gset->varNames; const char *coord = tra ? vnames->coordA : vnames->k; size_t blockx, blocky; unsigned int x, y; const struct SubproblemDim *dims = &pfPriv->gset->subdims[1]; (void)mrole; blockx = blocky = 0; // zero triangular part of tile a // either single row of tile a either the whole tile have been fetched if (tra) { blocky = dims->bwidth; blockx = dims->y; } else { blocky = dims->y; blockx = dims->bwidth; } // loop through block rows for(y = 0; y < blocky; y++) { // loop through all elements of block row for(x = 0; x < blockx; x++) { Kstring kstr[3]; const char *cmp = diag ? ">=" : ">"; if (diag) { sprintfTileElement(&kstr[0], tile, x, y, 1); } else { sprintfTileElement(&kstr[0], tile, y, x, 1); } genAdd(stmtStr[0], x); genAdd(stmtStr[1], y); sprintf(tmp, "%s = Ktail <= %i || %s%s %s n%s ? 
0 : %s;\n", kstr[0].buf, y, coord, stmtStr[0], cmp, stmtStr[1], kstr[0].buf); kgenAddStmt(ctx, tmp); } pfPriv->fetchNumA++; } return 0; } static int genPostFetchVertDiag( struct KgenContext *ctx, MatrixRole mrole, void *priv) { TilePostFetchPrivate *pfPriv = ((struct symvPrivate *)priv)->pfPriv; TileMulOpts *mulOpts = ((struct symvPrivate *)priv)->mulOpts; Tile *tile = (Tile *)&pfPriv->gset->tileA; bool diag = ((struct symvPrivate *)priv)->diag; char tmp[1024], tmp1[128] = ""; char stmtStr[2][128]; size_t blockx, blocky; unsigned int x, y; const struct SubproblemDim *dims = &pfPriv->gset->subdims[1]; (void)mrole; blockx = blocky = 0; // zero triangular part of tile a // either single row of tile a either the whole tile have been fetched if (!diag) { blocky = dims->bwidth; blockx = dims->y; } else { blocky = dims->y; blockx = dims->bwidth; } // loop through block rows for(y = 0; y < blocky; y++) { // loop through all elements of block row for(x = 0; x < blockx; x++) { Kstring kstr[3]; const char *cmp = diag ? ">=" : ">"; const char *name = diag ? "k" : "coordA"; if (diag) { sprintfTileElement(&kstr[0], tile, y, x, 1); } else { sprintfTileElement(&kstr[0], tile, x, y, 1); } genAdd(stmtStr[0], x); genAdd(stmtStr[1], y); if (mulOpts->flags & TILEMUL_SKEW_B) { sprintf(tmp1, "Ktail <= %i || ", y); } sprintf(tmp, "%s = %s%s%s %s n%s ? 0 : %s;\n", kstr[0].buf, tmp1, name, stmtStr[0], cmp, stmtStr[1], kstr[0].buf); kgenAddStmt(ctx, tmp); } pfPriv->fetchNumA++; } return 0; } // global memory based kernel generator static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; bool upper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0); char tmp[2048]; const char *typeName; DataType dtype = kextra->dtype; BlasGenSettings gset, tgset, lset, gset1; CLBLASKernExtra kextraTmp; TileMulOpts mulOpts, tmulOpts; KernelVarNames *vnames = &gset.varNames; ssize_t ret; size_t vecLen = kextra->vecLen; const char *outTypeName; bool b; TilePostFetchPrivate pfPriv; struct symvPrivate priv; size_t wgSize; bool tailM = (kflags & KEXTRA_TAILS_M) != 0; bool tailK = (kflags & KEXTRA_TAILS_K) != 0; bool tra = (kflags & KEXTRA_COLUMN_MAJOR) != 0; bool rowMaj = !isMatrixAccessColMaj(CLBLAS_SYMV, kflags, MATRIX_A); bool isComplex = isComplexType(dtype); Tile tileb; const char *gid = "get_group_id(0)"; const char *lid = "get_local_id(0)"; bool isHoriz = subdims[1].bwidth >= subdims[1].y; unsigned int bStep = subdims[0].bwidth / subdims[1].bwidth; unsigned int cLocal; unsigned int nPlans; wgSize = (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth); assert(pgran->wgSize[0] == wgSize); assert(subdims[0].x == 1); assert(subdims[1].x == 1); memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&pfPriv, 0, sizeof(pfPriv)); memset(&priv, 0, sizeof(priv)); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } // at first, generate needed declarations b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); typeName = dtypeBuiltinType(dtype); declareSymvKernel(ctx, dtype, pgran, kflags); ret = kgenBeginFuncBody(ctx); /* 1D work space. 
Matrix is divided among wi, each calculates it's own * part of vector y */ kgenAddStmt(ctx, "#define M actualN\n"); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.subdims[0].itemX = gset.subdims[0].x = 1; gset.subdims[1].itemX = gset.subdims[1].x = 1; gset.subdims[0].bwidth = gset.subdims[1].bwidth; gset.flags |= BGF_WHOLE_A | BGF_UPTRS; gset.kextra = kextra; gset.pgran = pgran; initDefaultTiles(&gset, CLBLAS_SYMV, 0, PRIV_STORAGE_VARIABLE_SET); gset.tileA.vecLen = umin(8u, tra ? gset.tileA.nrCols : gset.tileA.nrRows); if (isComplex) { gset.tileCY.vecLen = 1; } declareTileStorages(ctx, &gset); genZeroTile(ctx, &gset.tileCY); getVectorTypeName(dtype, gset.tileCY.vecLen, &outTypeName, NULL); cLocal = wgSize / bStep; nPlans = gset.tileCY.nrRows / gset.tileCY.vecLen; sprintf(tmp, "__local %s localRes[%u][%u];\n", outTypeName, pgran->wgSize[0], nPlans); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint coordA = (%s * %u + %s / %u) * %lu + startN;\n", gid, cLocal, lid, bStep, subdims[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint n = coordA;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint k0 = (%s %% %u) * %lu;\n", lid, bStep, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "actualN += startN;\n"); kgenAddBlankLine(ctx); kgenBeginBranch(ctx,"if (coordA < actualN && k0 < N)"); genIncPointers(ctx, kflags); sprintf(tmp, "const GPtr Ag = {(__global %s*)A};\n" "const GPtr Xg = {(__global %s*)X};\n", typeName, typeName); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "uint k = k0;\n"); if (tailK) { sprintf(tmp, "uint Ntail = N %% %lu;\n", subdims[1].bwidth); kgenAddStmt(ctx, tmp); sprintf(tmp, "uint Ktail = N %% %lu;\n\n", subdims[1].y); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "if (n + Ktail < N)"); kgenAddStmt(ctx, "N -= Ntail;\n"); kgenAddBlankLine(ctx); } mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } vnames->A = "Ag"; vnames->B = "Xg"; vnames->coordA = "coordA"; vnames->coordB = ""; //should not be used for vector vnames->k = "k"; vnames->lda = "lda"; vnames->sizeK = "N"; vnames->sizeM = "N"; mulOpts.flags |= TILEMUL_NOT_FETCH_B | TILEMUL_TRB | TILEMUL_NOT_INC_K; if ((kflags & KEXTRA_CONJUGATE_A) != 0) { mulOpts.flags |= TILEMUL_CONJA; } if ((kflags & KEXTRA_ENABLE_MAD) != 0) { mulOpts.core = TILEMUL_MAD; } else { mulOpts.core = TILEMUL_MULADD; } mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; if (rowMaj) { mulOpts.flags |= TILEMUL_BW_STRIDE; } if (upper) { kgenAddStmt(ctx, "// k loop over column from the beginning of the column till the diagonal\n"); } else { kgenAddStmt(ctx, "// k loop over row from the beginning of the row till the diagonal\n"); } sprintf(tmp, "for (; k < n/%lu*%lu; k += %lu)", subdims[1].bwidth, subdims[1].bwidth, bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); upper ^= rowMaj; tra ^= rowMaj; if (upper ^ rowMaj && tra) { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= !upper; tgset = gset; tmulOpts = mulOpts; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kextraTmp = *kextra; gset1 = gset; kextraTmp.vecLen = 1; gset1.kextra = &kextraTmp; gset1.subdims[0].bwidth = gset1.subdims[1].bwidth = 1; gset1.tileBX.nrRows = 1; gset1.tileA.nrCols = 1; kextraTmp.vecLenA = 1; } if (isHoriz) { lset = gset; lset.subdims[0].bwidth = lset.subdims[1].bwidth = lset.subdims[1].y = umin(subdims[1].bwidth, subdims[1].y); 
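/*
 * Note: the following is a reading of the diagonal handling in this symv
 * generator, not an authoritative statement, and the numbers in the example
 * are illustrative only. For the diagonal block the sub-tile is forced to be
 * square, with edge umin(subdims[1].bwidth, subdims[1].y); e.g. with
 * bwidth == 4 and y == 8 the diagonal tile is emitted as 4x4. That keeps the
 * mirror post-fetch (genPostFetchMirror, hooked up below) working on a square
 * region around the main diagonal, where each element is taken from whichever
 * side of the diagonal is actually stored.
 */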
lset.tileA.nrCols = lset.tileA.nrRows = lset.tileBX.nrRows = lset.subdims[1].y; kgenAddStmt(ctx, "// the diagonal\n"); kgenBeginBranch(ctx, "if (k <= n)"); kgenAddStmt(ctx, "uint k1 = k;\n"); if (subdims[1].bwidth != subdims[1].y) { kgenAddStmt(ctx, "// the pred diagonal\n"); sprintf(tmp, "for (; k < n; k += %lu)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ } initTile(&tileb, "b", lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.subdims[1].bwidth, lset.tileA.dtype, PRIV_STORAGE_VARIABLE_SET, lset.tileA.trans, lset.tileA.packed); declareOneTileStorage(ctx, &tileb); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.tilea = lset.tileA; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; lset.tileA = tileb; mulOpts.postFetch = genPostFetchMirror; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans = lset.tileA.trans ^= true; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; if (subdims[1].bwidth != subdims[1].y) { size_t width = umax(subdims[1].bwidth, subdims[1].y); kgenAddStmt(ctx, "// the post diagonal\n"); if (tailK) { kgenBeginBranch(ctx, "if(k < N)"); } sprintf(tmp, "for (k += %lu; k < n/%lu*%lu+%lu; k += %lu)", lset.subdims[1].bwidth, width, width, width, lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &lset.tileBX, lset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &lset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (k += %lu; k < actualN; k++)", lset.subdims[1].bwidth); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC_A; kgenEndBranch(ctx, NULL); } } sprintf(tmp, "k = k1 + %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); } else { kgenAddStmt(ctx, "// the diagonal\n"); sprintf(tmp, "if (k <= (n + (get_local_id(0)%%%lu)*%lu))", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.subdims[1].bwidth, dtype, vnames, mulOpts.flags, kflags); kgenBeginBranch(ctx, NULL); priv.mulOpts = &mulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &gset; mulOpts.postFetch = genPostFetchVertDiag; mulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); if (upper ^ rowMaj && tra) { mulOpts.flags &= ~TILEMUL_TRA; } else { 
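/*
 * Assumption, inferred from the surrounding logic rather than stated in the
 * source: only one triangle of the symmetric matrix is stored, so once the
 * diagonal tile has been mirrored the transposed-access flag (together with
 * tileA.trans just below) is flipped relative to the pre-diagonal loop; the
 * remaining k-loop then appears to read the stored triangle in transposed
 * order to cover the other half of the row/column.
 */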
mulOpts.flags |= TILEMUL_TRA; } gset.tileA.trans ^= true; lset = gset; sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, NULL); priv.diag = false; ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); mulOpts.postFetch = NULL; mulOpts.postFetchPriv = NULL; sprintf(tmp, "k += %lu;\n", bStep*subdims[1].bwidth); kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* if */ } if (upper) { kgenAddStmt(ctx, "// k loop over row from the diagonal till the right\n"); } else { kgenAddStmt(ctx, "// k loop over column from the diagonal till the bottom\n"); } sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop */ if (tailK) { /* Handle tail along vector X */ kgenAddStmt(ctx, "N += Ntail;\n"); mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; #if 1 sprintf(tmp, "for (; k < N; k++)"); kgenBeginBranch(ctx, tmp); gset1.tileA.trans = gset.tileA.trans; genFetchX(ctx, &gset1.tileBX, gset1.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset1, &mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); /* k loop for tails along vector X */ #else mulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_NOT_INC_K; genFetchX(ctx, &gset.tileBX, gset.kextra->vecLen, dtype, vnames, mulOpts.flags, kflags); ret = tileMulGen(ctx, &gset, &mulOpts); if (ret != 0) { return ret; } #endif kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); sprintf(tmp, "for (; k < N; k += %lu)", bStep*subdims[1].bwidth); kgenBeginBranch(ctx, tmp); tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A; genFetchX(ctx, &tgset.tileBX, tgset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; pfPriv.gset = &tgset; priv.diag = false; pfPriv.funcID = CLBLAS_SYMV; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; ret = tileMulGen(ctx, &tgset, &tmulOpts); if (ret != 0) { return ret; } if (isHoriz) { sprintf(tmp, "if (k + %lu > N) break;\n", subdims[1].bwidth); } else { sprintf(tmp, "if (k + %lu > N + (get_local_id(0)%%%lu)*%lu) break;\n", subdims[1].y, subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); /* k loop */ kgenBeginBranch(ctx, "if (k < N)"); if (isHoriz) { kgenAddStmt(ctx, "k = n;\n"); } else { sprintf(tmp, "n += (get_local_id(0)%%%lu)*%lu;\n", subdims[1].y/subdims[1].bwidth, subdims[1].bwidth); kgenAddStmt(ctx, tmp); } genFetchX(ctx, &lset.tileBX, lset.kextra->vecLen, dtype, vnames, tmulOpts.flags, kflags); priv.mulOpts = &tmulOpts; priv.pfPriv = &pfPriv; priv.diag = true; pfPriv.funcID = CLBLAS_SYMV; pfPriv.gset = &lset; tmulOpts.postFetch = genPostFetchDiag; tmulOpts.postFetchPriv = &priv; if (!isHoriz) { if (upper ^ rowMaj && tra) { tmulOpts.flags &= ~TILEMUL_TRA; } else { tmulOpts.flags |= TILEMUL_TRA; } kgenAddStmt(ctx, "Ktail = N - n;\n"); priv.coord = true; } else { priv.coord = false; } tmulOpts.flags |= TILEMUL_SKEW_B | TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_K; ret = tileMulGen(ctx, &lset, &tmulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } if (!isMatrixAccessColMaj(CLBLAS_GEMV, kflags, MATRIX_A)) { mulOpts.flags &= ~TILEMUL_BW_STRIDE; } kgenEndBranch(ctx,NULL); 
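/*
 * Sketch of the final reduction and update emitted below, hedged to what is
 * visible in this generator: each work-item stores its partial result tile
 * (tileCY) into localRes for its local id, a local barrier is issued, and the
 * work-item with (lid % bStep) == 0 accumulates the bStep partial results for
 * its row block and applies y := alpha*A*x + beta*y at position
 * (coordA - startN).
 */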
genStoreLocalResult(ctx, &gset.tileCY, lid); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); sprintf(tmp, "if ((%s %% %u) == 0 && coordA < actualN && k0 < N)", lid, bStep); kgenBeginBranch(ctx, tmp); genAddLocalResult(ctx, &gset.tileCY, lid, bStep, 1); /* write back the results */ /* y := alpha*A*x + beta*y */ sprintf(tmp,"(%s - startN)", vnames->coordA); setResultPos(ctx, kflags, tmp); updateResultVectorTiled(ctx, kflags, vecLen, &gset.tileCY); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? -EOVERFLOW : ret; } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; cl_int inc; int i; initSizeKarg(&args[0], blasArgs->K); assignScalarKarg(&args[1], &(blasArgs->alpha), blasArgs->dtype); INIT_KARG(&args[2], blasArgs->A); INIT_KARG(&args[3], blasArgs->B); i = 4; if (!(kflags & KEXTRA_BETA_ZERO)) { assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype); } initMemobjKarg(&args[i++], blasArgs->C, NULL, 0, 0); initSizeKarg(&args[i++], blasArgs->lda.matrix); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offBX); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offCY); } if (!(kflags & KEXTRA_INCX_ONE)) { inc = blasArgs->ldb.vector; INIT_KARG(&args[i], inc); i++; } if (!(kflags & KEXTRA_INCY_ONE)) { inc = blasArgs->ldc.vector; INIT_KARG(&args[i], inc); i++; } initSizeKarg(&args[i++], blasArgs->offsetN); initSizeKarg(&args[i++], blasArgs->N); //Actual N } static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBlasKargs *kargs = (CLBlasKargs*)args; (void)extra; (void)subdims; if (kargs->offsetN) { if (kargs->ldc.vector < 0) { // K store the original height of the matrix A kargs->offCY += (kargs->K - kargs->offsetN) * abs(kargs->ldc.vector); } else { kargs->offCY += kargs->offsetN * kargs->ldc.vector; } } } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong size; (void)kernelArgs; /* * One needs y1 * wgSize size of local memory in elements, * but y1 is not calculated yet. The expression below produces * reliable a larger value. It is larger in dims[1].bwidth times. 
*/ size = dim[0].y * dim[0].bwidth * dtypeSize(dtype); return (size <= ldsSize); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { const CLBlasKargs *kargs = args; unsigned int subgr = subdims[0].bwidth / subdims[1].bwidth; (void)extra; //each work item handles y1 lines threads[0] = divRoundUp(kargs->N, subdims[1].y) * subgr; threads[0] = roundUp(threads[0], pgran->wgSize[0]); threads[1] = 0; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } static int symvSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs) { (void)subdimsNum; DUMMY_ARG_USAGE(pArgs); pgran->wgDim = 1; pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; subdims[1].bwidth = 4; subdims[1].itemX = subdims[1].x = 1; subdims[1].itemY = subdims[1].y = 4; subdims[0].bwidth = 8 * subdims[1].bwidth; subdims[0].itemX = subdims[0].x = 1; subdims[0].itemY = subdims[0].y = 8 * subdims[1].y; return 0; } static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { unsigned int divider1 = dtypeSize(dtype)/sizeof(cl_float); //EINVAL if( (subdimsNum<2)|| (NULL==pgran)|| (NULL==subdims) ){ return false; } if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } if( subdims[1].x != subdims[1].itemX || subdims[1].y != subdims[1].itemY ){ return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].x || subdims[0].y % subdims[1].y || subdims[0].bwidth % subdims[1].bwidth ){ return false; } //check fitting of bw to common vector sizes if( isComplexType(dtype) ){ if( 2*subdims[1].bwidth > 32 ){ return false; } } // check dimensions if( subdims[1].bwidth > 16 / divider1 || subdims[1].x > 1 || subdims[1].y > 16 / divider1 ){ return false; } if( subdims[0].bwidth > 128 || subdims[0].x > 1 || subdims[0].y > 128 ){ return false; } if (64 != (subdims[0].y / subdims[1].y) * (subdims[0].bwidth / subdims[1].bwidth)) { return false; } if (subdims[0].y > subdims[0].bwidth && subdims[0].y / subdims[0].bwidth < (subdims[0].bwidth / subdims[1].bwidth)) { return false; } // passed PGranularity should be checked if( PGRAN_CHECK == check ){ if( pgran->wgSize[0] * pgran->wgSize[1] != 64 ){ return false; } } // PGranularity should be calculated else{ pgran->wgDim = 1; pgran->wgSize[1] = 1; pgran->wgSize[0] = 64; //subdims[0].bwidth = (pgran->wgSize[0] * subdims[1].bwidth) / // (subdims[0].y / subdims[1].y); } /*Debug out for Tune*/ return true; } void initSymvPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block symv"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &symvSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } clblas-2.10/src/library/blas/gens/syr2_lds.cpp000066400000000000000000000215741264277366700213060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * SYR2 Generator */ #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include //#define DEBUG_SYR2 extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_SYR2 printf("solverFlags called......\n"); #endif return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initSyr2DefaultPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps syr2Ops = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(kargs->uplo == clblasUpper) { if( (kargs->N) % vlen) { kflags = KEXTRA_NO_COPY_VEC_A; } } if( kargs->pigFuncID == CLBLAS_SPR2 ) { kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_SYR2 printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( kargs->pigFuncID == CLBLAS_SPR2 ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initSyr2DefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_SYR2 printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat); fflush(stdout); #endif mempat->name = "LDS based syr"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &syr2Ops; mpatExtra.aMset = 0; mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector //mpatExtra.cMset = CLMEM_LEVEL_LDS; // For "y" vector mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; //mpatExtra.mobjC = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0]; // 1D Block #ifdef DEBUG_SYR2 printf("calcNrThreads called from syr2_lds.cpp\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; if ( order == clblasRowMajor ) { order = clblasColumnMajor; } #ifdef DEBUG_SYR2 printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x); #endif size_t TARGETROWS = subdims->y ; #ifdef DEBUG_SYR2 printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS); #endif size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1; #ifdef DEBUG_SYR2 printf("blocks : %d\n", blocks); #endif threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE; #ifdef DEBUG_SYR2 printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { int BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; char targetRows[10], blockSize[10]; if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_SYR2 printf("SYR2 GENERATOR called....\n"); #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; if ((subdims->y % extraFlags->vecLenA) != 0) { printf("WARNING: SYR2: generator: TARGETROWS must be divisible by Vector Length\n"); return 0; } size_t TARGETROWS = 0; if(order == clblasColumnMajor) { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)syr2_her2_CL_kernel)) : (strcpy(tempTemplate, (char*)syr2_her2_CU_kernel)); } else { printf("WARNING: SYR2: Rowmajor order is implemented in columnMajor. This part should never get executed.\n"); return 0; } TARGETROWS = subdims->y; if ((BLOCKSIZE % TARGETROWS) != 0) { printf("WARNING: SYR2: generator: Invalid Block Size\n"); return 0; } #ifdef DEBUG_SYR2 printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. 
sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_SYR2 printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_SYR2 printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_SYR2 printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); sprintf( targetRows, "%" SPREFIX "u", TARGETROWS ); sprintf( blockSize, "%d", BLOCKSIZE ); #ifdef DEBUG_SYR2 printf("TARGET ROWS = %s\n", targetRows); printf("BLOCK SIZE = %s\n", blockSize); #endif kobj.put("%TARGET_ROWS", (const char *)targetRows); kobj.put("%BLOCKSIZE", (const char *) blockSize); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? -EOVERFLOW : ret; } /* ( __global %TYPE* _A, __global const %TYPE* _X, __global const %TYPE* _Y, int N, int offx, int incx, int offy, int incy, int offa, int lda, %TYPE alpha) */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int inc; INIT_KARG(&args[0], blasArgs->A); //A - input/output matrix - argument INIT_KARG(&args[1], blasArgs->B); //X - x vector INIT_KARG(&args[2], blasArgs->C); //Y - y vector initSizeKarg(&args[3], blasArgs->N); initSizeKarg(&args[4], blasArgs->offBX); inc = blasArgs->ldb.vector; INIT_KARG(&args[5], inc); initSizeKarg(&args[6], blasArgs->offCY); inc = blasArgs->ldc.vector; INIT_KARG(&args[7], inc); initSizeKarg(&args[8], blasArgs->offa); initSizeKarg(&args[9], blasArgs->lda.matrix); assignScalarKarg(&args[10], &(blasArgs->alpha), blasArgs->dtype); return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong maxSize; CLBlasKargs *blasArgs; blasArgs = (CLBlasKargs *)kernelArgs; // 4 buffers for xShared, yShared, xSharedTrans and ySharedTrans and 2 integers for the values of iShared and jShared. maxSize = (dim->y * 4 * sizeof(dtype)) + (2 * sizeof(int)); return ((maxSize) <= ldsSize); } //#undef DEBUG_SYR2 clblas-2.10/src/library/blas/gens/syr_lds.cpp000066400000000000000000000211441264277366700212150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * SYR Generator */ #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include //#define DEBUG_SYR extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_SYR printf("solverFlags called......\n"); #endif return (SolverFlags)(SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initSyrDefaultPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps syrOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if(kargs->uplo == clblasUpper) { if( (kargs->N) % vlen) { kflags = KEXTRA_NO_COPY_VEC_A; } } if( kargs->pigFuncID == CLBLAS_SPR ) { kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access } #ifdef DEBUG_SYR printf("SYR: selectVectorization being called\n"); #endif return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_SYR printf("Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->pigFuncID == CLBLAS_SPR ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initSyrDefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_SYR printf("initSyrDefaultPattern called with mempat = 0x%p\n", (void *)mempat); fflush(stdout); #endif mempat->name = "LDS based syr"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &syrOps; mpatExtra.aMset = 0; mpatExtra.bMset = CLMEM_LEVEL_LDS; // For "x" vector mpatExtra.mobjA = CLMEM_GLOBAL_MEMORY; mpatExtra.mobjB = CLMEM_GLOBAL_MEMORY; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0]; // 1D Block #ifdef DEBUG_SYR printf("calcNrThreads called from syr_reg.cpp\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? 
clblasColumnMajor: clblasRowMajor; if ( order == clblasRowMajor ) { order = clblasColumnMajor; } #ifdef DEBUG_SYR printf("subdims->y : %d, subdims->x : %d\n", (int)subdims->y, (int)subdims->x); #endif size_t TARGETROWS = subdims->y ; #ifdef DEBUG_SYR printf("kargs-> N : %d, TARGETROWS: %d\n", (int)kargs->N, TARGETROWS); #endif size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1; #ifdef DEBUG_SYR printf("blocks : %d\n", blocks); #endif threads[0] = ((blocks * (blocks + 1)) / 2) * BLOCKSIZE; #ifdef DEBUG_SYR printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", (int)pgran->wgSize[0], (int)threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { int BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; char targetRows[10], blockSize[10]; if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_SYR printf("SYR GENERATOR called....\n"); #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; if ((subdims->y % extraFlags->vecLenA) != 0) { printf("WARNING: SYR: generator: TARGETROWS must be divisible by Vector Length\n"); return 0; } size_t TARGETROWS = 0; if(order == clblasColumnMajor) { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)syr_her_CL_kernel)) : (strcpy(tempTemplate, (char*)syr_her_CU_kernel)); } else { printf("WARNING: SYR: Rowmajor order is implemented in columnMajor. This part should never get executed.\n"); return 0; } TARGETROWS = subdims->y; if ((BLOCKSIZE % TARGETROWS) != 0) { printf("WARNING: SYR: generator: Invalid Block Size\n"); return 0; } #ifdef DEBUG_SYR printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_SYR printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_SYR printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_SYR printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD); sprintf( targetRows, "%" SPREFIX "u", TARGETROWS ); sprintf( blockSize, "%d", BLOCKSIZE ); #ifdef DEBUG_SYR printf("TARGET ROWS = %s\n", targetRows); printf("BLOCK SIZE = %s\n", blockSize); #endif kobj.put("%TARGET_ROWS", (const char *)targetRows); kobj.put("%BLOCKSIZE", (const char *) blockSize); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? 
-EOVERFLOW : ret; } /* __global %TYPE* _A, __global const %TYPE* _X, int N, int offx, int incx, int offa, int lda, %PTYPE alpha */ static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int inc; INIT_KARG(&args[0], blasArgs->A); //A - input/output matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - x vector initSizeKarg(&args[2], blasArgs->N); initSizeKarg(&args[3], blasArgs->offBX); inc = blasArgs->ldb.vector; INIT_KARG(&args[4], inc); initSizeKarg(&args[5], blasArgs->offA); initSizeKarg(&args[6], blasArgs->lda.matrix); assignScalarKarg(&args[7], &(blasArgs->alpha), blasArgs->dtype); return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { cl_ulong maxSize; CLBlasKargs *blasArgs; blasArgs = (CLBlasKargs *)kernelArgs; // 2 buffers for xShared and yShared and 2 integers for the values of iShared and jShared. maxSize = (dim->y * 2 * sizeof(dtype)) + (2 * sizeof(int)); return ((maxSize) <= ldsSize); } //#undef DEBUG_SYR clblas-2.10/src/library/blas/gens/syrxk.c000066400000000000000000002262041264277366700203620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /** * SYRk and SYR2K kernel generator */ #include #include #include #include #include #include #include #include #include #include #include #include #include "init.h" #include "blas_kgen.h" #include "gen_helper.h" #include "blas_subgroup.h" #include "tile_iter.h" /* * Priority within a statement batch of different kind * of statements consisting update around the diagonal. 
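 *
 * Statements queued with kgenAddStmtToBatch()/kgenBatchPrintf() are grouped
 * by these priorities when the batch is flushed, so that, for example,
 * fetches can be merged into a single clause (see MAX_FETCH_CLAUSE_SIZE
 * below).  Purely as an illustration, the emitted shape per diagonal
 * sub-tile is roughly:
 *
 *     cc0 = ...;                   // CALC_COORDS_STMT_PRIO
 *     c0  = dst[cc0];              // FETCH_STMT_PRIO
 *     c0  = mad(res0, alpha, c0);  // MAD_STMT_PRIO
 *     dst[cc0] = c0;               // STORE_STMT_PRIO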
*/ enum { CALC_COORDS_STMT_PRIO, FETCH_STMT_PRIO, MAD_STMT_PRIO, STORE_STMT_PRIO }; enum { MAX_DIAG_UPRES_STORAGE_SIZE = 95, MAX_FETCH_CLAUSE_SIZE = 8 }; typedef struct { size_t staggered; } extraData_t; struct SetupPtrAttrs { MatrixRole mrole; const char *basePtr; const char *ldName; const char *offName; KernelExtraFlags offMask; }; typedef struct SyrxkExtraPriv { unsigned int maxVlenC; } MAY_ALIAS SyrxkExtraPriv; static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra, BlasFunctionID funcID); static void assignKargs( KernelArg *args, const CLBlasKargs *blasArgs, KernelExtraFlags kflags, BlasFunctionID funcID); static void syrkAssignKargs(KernelArg *args, const void *params, const void *extra); static void syr2kAssignKargs(KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static bool checkCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static void syrkCalcThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t syrkGenerator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { return generator(buf, buflen, subdims, pgran, extra, CLBLAS_SYRK); } static ssize_t syr2kGenerator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { return generator(buf, buflen, subdims, pgran, extra, CLBLAS_SYR2K); } static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static int syrkSubgGetPerf( unsigned int kflags, const void *args); static int syrkSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs ); static int syrkBlockGetPerf( unsigned int kflags, const void *args); #if 0 static int syrkBlockGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum); #endif // ---------------------------------------------------------------------------- static SolverOps syrkSolverOps = { syrkGenerator, syrkAssignKargs, NULL, syrkBlockGetPerf, NULL, syrkCalcThreads, NULL, solverFlags, fixupArgs, NULL,//getDefaultDecomp checkCalcDecomp, NULL, NULL }; static SolverOps syr2kSolverOps = { syr2kGenerator, syr2kAssignKargs, NULL, syrkBlockGetPerf, NULL, syrkCalcThreads, NULL, solverFlags, fixupArgs, NULL,//getDefaultDecomp checkCalcDecomp, NULL, NULL }; static SolverOps syrkSubgSops = { syrkGenerator, syrkAssignKargs, NULL, syrkSubgGetPerf, NULL, syrkCalcThreads, NULL, solverFlags, fixupArgs, syrkSubgGetDefaultDecomp, subgCheckCalcDecomp, NULL, NULL }; static SolverOps syr2kSubgSops = { syr2kGenerator, syr2kAssignKargs, NULL, syrkSubgGetPerf, NULL, syrkCalcThreads, NULL, solverFlags, fixupArgs, syrkSubgGetDefaultDecomp, subgCheckCalcDecomp, NULL, NULL }; //----------------------------------------------------------------------------- static void genPanelBlocksStmt( struct KgenContext *ctx, const char *varName, int roundDir, const SubproblemDim *dim, const char *start, const char *end) { char tmp[1024]; char *p; p = tmp + sprintf(tmp, "%s = (%s", varName, end); if (start[0] != '\0') { p += sprintf(p, " - %s", start); } if (roundDir) { p += sprintf(p, " + %lu", dim->y - 1); } sprintf(p, ") / 
%lu;\n", dim->y); kgenAddStmt(ctx, tmp); } //----------------------------------------------------------------------------- static void genSetupPointers( struct KgenContext *ctx, const BlasGenSettings *gset, BlasFunctionID funcID, FetchAddrMode addrMode, int rank) { const CLBLASKernExtra *kextra = gset->kextra; char dstPtr[64]; const char *coordName; struct SetupPtrAttrs attrs[3] = { {MATRIX_A, "A", "lda", "offA", KEXTRA_A_OFF_NOT_ZERO}, {MATRIX_B, "B", "ldb", "offB", KEXTRA_BX_OFF_NOT_ZERO}, {MATRIX_C, "C", "ldc", "offC", KEXTRA_CY_OFF_NOT_ZERO} }; int idx = 0; int i; Kstring k1, k2, k3; Kstring madExpr; unsigned int scale; unsigned int vecLen; FetchAddrMode relFlag; /* * Pointers are serviced in the following order: * B for tilemul, A for tilemul, C */ for (i = 0; i < 3; i++) { // The output pointer should be shifted once in case of 2-rank update if ((i == 2) && rank) { break; } emptyKstring(&k1); emptyKstring(&k2); emptyKstring(&k3); scale = 0; // select start coordinate relFlag = (i) ? FETCH_ADDR_A_RELATIVE : FETCH_ADDR_B_RELATIVE; if (addrMode & relFlag) { coordName = (i) ? "coord.y" : "coord.x"; kstrcpy(&k2, coordName); } // fill destination pointer to assign if (i == 2) { strcpy(dstPtr, "C"); } else { const char *p; p = (i) ? gset->varNames.A : gset->varNames.B; strcpy(dstPtr, p); } // select index in the attribute array switch (i) { case 0: idx = (funcID == CLBLAS_SYRK) ? 0 : (1 - rank); break; case 1: idx = (funcID == CLBLAS_SYRK) ? 0 : rank; break; case 2: idx = 2; break; } vecLen = getVecLen(gset, funcID, attrs[idx].mrole); // construct expression if (attrs[idx].mrole != MATRIX_C) { if (isMatrixAccessColMaj(funcID, gset->kextra->flags, attrs[idx].mrole)) { kstrcpy(&k1, "1"); scale = vecLen; } else { kstrcpy(&k1, attrs[idx].ldName); } } if (kextra->flags & attrs[idx].offMask) { if ((attrs[idx].mrole == MATRIX_C) || (vecLen == 1)) { kstrcpy(&k3, attrs[idx].offName); } else { int shift = findHighestSetBit(vecLen); ksprintf(&k3, "(%s >> %d)", attrs[idx].offName, shift); } } sprintfFastScalarMad(&madExpr, &k1, &k2, scale, &k3); // check if it is not "0" or empty string if (strlen(madExpr.buf) <= 1) { if (attrs[idx].mrole != MATRIX_C) { kgenPrintf(ctx, "%s = %s;\n", dstPtr, attrs[idx].basePtr); } } else { kgenPrintf(ctx, "%s = %s + %s;\n", dstPtr, attrs[idx].basePtr, madExpr.buf); } } } //----------------------------------------------------------------------------- static void declareKernel( struct KgenContext *ctx, const BlasGenSettings *gset, BlasFunctionID funcID, const char* nameSuffix ) { char tmp[1024], betaStr[64], bstr[64], strOffABC[256]; DataType dtype = gset->kextra->dtype; KernelExtraFlags kflags = gset->kextra->flags; const PGranularity *pgran = gset->pgran; const char *tnameOrig, *tnameA; unsigned int vecLen; char fpref; const char *rank; tnameOrig = dtypeBuiltinType(dtype); vecLen = getVecLen(gset, funcID, MATRIX_A); getVectorTypeName(dtype, vecLen, &tnameA, NULL); fpref = dtypeToBlasPrefix(dtype); if (kflags & KEXTRA_BETA_ZERO) { betaStr[0] = '\0'; } else { sprintf(betaStr, " const %s beta,\n", tnameOrig); } if (funcID == CLBLAS_SYR2K) { const char *tnameB; rank = "2"; vecLen = getVecLen(gset, funcID, MATRIX_B); getVectorTypeName(dtype, vecLen, &tnameB, NULL); sprintf(bstr, " const __global %s *restrict B,\n" " uint ldb,\n", tnameB); } else { rank = ""; bstr[0] = '\0'; } strOffABC[0] = '\0'; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { strcpy(strOffABC, ",\n uint offA"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { strcat(strOffABC, ",\n uint offB"); } if (kflags & 
KEXTRA_CY_OFF_NOT_ZERO) { strcat(strOffABC, ",\n uint offC"); } sprintf(tmp, "__attribute__((reqd_work_group_size(%u, 1, 1)))\n" "void __kernel\n" "%csyr%sk%s(\n" " uint N,\n" " const uint K,\n" " const %s alpha,\n" " const __global %s *restrict A,\n" " uint lda,\n" "%s" // B and ldb "%s" // beta " __global %s *C,\n" " uint ldc,\n" " const uint startN,\n" " const uint origN%s)\n", pgran->wgSize[0], fpref, rank, nameSuffix, tnameOrig, tnameA, bstr, betaStr, tnameOrig, strOffABC); kgenDeclareFunction(ctx, tmp); } //----------------------------------------------------------------------------- static void genHead( struct KgenContext *ctx, BlasGenSettings *gset, BlasFunctionID funcID, SubgVarNames *pSubgVNames, bool subgMode) { char tmp[1024], tmp1[128]; char start[128], end[128]; char *p; const char *vecTypeA; unsigned int vlenA, vlenB; unsigned int l1Pans; const SubproblemDim *dim = gset->subdims; const CLBLASKernExtra *kextra = gset->kextra; KernelExtraFlags kflags = kextra->flags; KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL | KEXTRA_SYRK_EVALUATE_DIAGONAL; bool isDiagSep= ((kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) != 0); bool isEvalOnlyDiag = ((kflags & diagFlags) == diagFlags); l1Pans = (unsigned int)(dim[0].y / dim[1].y); vlenA = getVecLen(gset, funcID, MATRIX_A); vlenB = getVecLen(gset, funcID, MATRIX_B); getVectorTypeName(kextra->dtype, vlenA, &vecTypeA, NULL); // the variable stores N, passed as argument. // this variable is used for C matrix hit check kgenPrintf( ctx, "uint argN = N;\n" ); if ( subgMode ) { gset->varNames.LDS = "scratch"; // declaring variables used by subgroup mode pSubgVNames->itemId = "itemId"; pSubgVNames->subgCoord = "subgCoord"; kgenAddBlankLine( ctx ); kgenAddBlankLine(ctx); kgenPrintf(ctx, "int skipTilemul = 0;\n" ); kgenPrintf(ctx, "int2 %s;\n", pSubgVNames->itemId ); kgenPrintf(ctx, "int2 %s;\n", pSubgVNames->subgCoord); // item ID kgenPrintf( ctx, "%s.x = get_local_id(0)%%%d;\n", pSubgVNames->itemId, dim[0].bwidth/dim[1].bwidth); // subgroup ID kgenPrintf( ctx, "%s.y = get_local_id(0)/%d;\n", pSubgVNames->itemId, dim[0].bwidth/dim[1].bwidth); // subgroup coordX kgenPrintf( ctx, "%s.x = %s.y/%d;\n", pSubgVNames->subgCoord, pSubgVNames->itemId, dim[0].y/dim[1].y ); // subgroup coordY kgenPrintf( ctx, "%s.y = %s.y%%%d;\n", pSubgVNames->subgCoord, pSubgVNames->itemId, dim[0].y/dim[1].y ); } if (funcID == CLBLAS_SYRK) { sprintf(tmp, "__global %s *B;\n", vecTypeA); kgenAddStmt(ctx, tmp); } if (kflags & KEXTRA_SYRK_2K_RANK) { const char *vecTypeB; getVectorTypeName(kextra->dtype, vlenB, &vecTypeB, NULL); sprintf(tmp, "__global %s *wiA;\n" "__global %s *wiB;\n", vecTypeA, vecTypeB); kgenAddStmt(ctx, tmp); } kgenAddStmt(ctx, "uint4 coord = 0;\n" /* contains coordB, coordA, k */ "uint k0 = 0;\n\n"); // extra variables needed for the upper triangular case if ( kflags & KEXTRA_UPPER_TRIANG ) { if (kflags & KEXTRA_TAILS_N) { kgenAddStmt(ctx, "uint step;\n"); } kgenAddStmt(ctx, "uint w;\n"); } kgenAddStmt(ctx, "const int lid = get_local_id(0);\n" "uint block = get_group_id(0);\n\n"); /* * Increase/decrease the outer block coordinate while the inner block number * exceeds the number of blocks. Inner block number is counted from the * diagonal up to the matrix edge. A is always the inner matrix. It is from * the largest panel. The resulting block number determines starting * coordinates. * * In the case of separate evaluating of the area around the diagonal it's * critically important that at least on step would be aligned. 
* Otherwise, solution areas will overlap that will lead to a wrong result. */ if ( kflags & KEXTRA_UPPER_TRIANG ) { char step[128], tmp2[128], *stepCalc = NULL; int roundDir; if ((kflags & KEXTRA_TAILS_N)) { sprintf(tmp2, "step = (coord.x %% %lu) ? (coord.x %% %lu) : %lu;\n", dim[0].x, dim[0].x, dim[0].x); stepCalc = tmp2; sprintf(step, "step"); } else { tmp2[0] = '\0'; sprintf(step, "%lu", dim[0].x); } if (!isEvalOnlyDiag) { start[0] = '\0'; } else { sprintf(start, "(coord.x - %s) / %lu * %lu", step, dim[0].y, dim[0].y); } if (!isDiagSep || isEvalOnlyDiag) { strcpy(end, "coord.x"); roundDir = 1; // round up } else { sprintf(end, "(coord.x - %s) / %lu * %lu", step, dim[0].y, dim[0].y); roundDir = 0; // round down } if (!isEvalOnlyDiag) { kgenAddStmt(ctx, "coord.x = origN;\n"); kgenAddStmt(ctx, stepCalc); sprintf(tmp, "w = (origN - startN - N + %lu) / %lu * %lu;\n" "k0 = (N + %lu) / %lu;\n" "if (block <= k0 * (w / %lu)) {\n" " coord.x -= (block / k0) * %lu;\n" " block %%= k0;\n" "}\n", dim[0].x - 1, dim[0].x, dim[0].x, dim[0].y - 1, dim[0].y, dim[0].x, dim[0].x); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "else"); sprintf(tmp, "coord.x = N;\n" "block -= k0 * (w / %lu);\n", dim[0].x); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, stepCalc); } else { kgenAddStmt(ctx, "coord.x = N;\n"); kgenAddStmt(ctx, stepCalc); } if (isDiagSep) { genPanelBlocksStmt(ctx, "k0", roundDir, dim, start, end); } kgenBeginBranch(ctx, "while (block >= k0)"); kgenAddStmt(ctx, "block -= k0;\n"); sprintf(tmp, "coord.x -= %s;\n", step); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, stepCalc); genPanelBlocksStmt(ctx, "k0", roundDir, dim, start, end); kgenEndBranch(ctx, NULL); kgenAddStmt(ctx, "coord.x += startN;\n"); if (!isEvalOnlyDiag) { kgenEndBranch(ctx, NULL); } if (isEvalOnlyDiag) { sprintf(tmp1, "%s", start); p = tmp1; } else { p = (char*)"startN"; } if ( subgMode ) { kgenPrintf( ctx, "coord.y = %s + block * %lu + %s.y * %lu;\n", p, dim[0].y, pSubgVNames->subgCoord, dim[1].y ); kgenPrintf( ctx, "coord.x = coord.x - %s + %s.x * %lu;\n", step, pSubgVNames->subgCoord, dim[1].x); kgenBeginBranch( ctx, "if (coord.y >= startN + argN || coord.x >= origN)"); kgenPrintf( ctx, "skipTilemul = 1;\n" ); kgenEndBranch( ctx, NULL ); sprintf( tmp, "if (coord.y >= coord.x + %lu)", dim[1].x ); kgenBeginBranch( ctx, tmp ); kgenPrintf( ctx, "skipTilemul = 1;\n" ); kgenEndBranch( ctx, NULL ); } else { sprintf(tmp, "coord.y = %s + block * %lu + lid %% %u * %lu;\n" "coord.x = coord.x - %s + lid / %u * %lu;\n" "\n" "if (coord.y >= startN + N || coord.x >= origN) {\n" " return;\n" "}\n\n" // Check if the tile is fully out of diagonal "if (coord.y >= coord.x + %lu) {\n" " return;\n" "}\n\n", p, dim[0].y, l1Pans, dim[1].y, step, l1Pans, dim[1].x, dim[1].x); kgenAddStmt(ctx, tmp); } } else { int vecAlign = 1; if (!isDiagSep || isEvalOnlyDiag) { strcpy(start, "coord.x"); } else { sprintf(start, "(coord.x + %lu) / %lu * %lu", dim[0].x + dim[0].y - 1, dim[0].y, dim[0].y); } if (isEvalOnlyDiag) { sprintf(end, "(coord.x + %lu) / %lu * %lu", dim[0].x + dim[0].y - 1, dim[0].y, dim[0].y); } else { vecAlign = umax(vlenA, vlenB); if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A) && (vecAlign > 1)) { sprintf(end, "(N + %u) / %u * %u", vecAlign - 1, vecAlign, vecAlign); } else { strcpy(end, "N"); } } if (!isEvalOnlyDiag) { sprintf(tmp, "k0 = (N + %lu) / %lu;\n" "if (block < k0 * (startN / %lu)) {\n" " coord.x = (block / k0) * %lu;\n" " block %%= k0;\n" "}\n", dim[0].y - 1, dim[0].y, dim[0].x, dim[0].x); kgenAddStmt(ctx, tmp); kgenBeginBranch(ctx, "else"); 
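/*
 * The code generated below maps the linear work group id onto a tile origin
 * inside the lower triangle: k0 holds the number of blocks remaining in the
 * current panel and the emitted while loop advances panel by panel until the
 * owning one is found.  Illustrative shape of the emitted code, assuming
 * dim[0].x == dim[0].y == 32 and no tail handling (the real constants come
 * from the decomposition):
 *
 *     k0 = (N + 31) / 32;
 *     while (block >= k0) {
 *         block -= k0;
 *         coord.x += 32;
 *         k0 = (N - coord.x + 31) / 32;
 *     }
 *     coord.x += startN;
 *     coord.y = N - (block + 1) * 32;
 */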
sprintf(tmp, "block -= k0 * (startN / %lu);\n", dim[0].x); kgenAddStmt(ctx, tmp); } if (isDiagSep) { genPanelBlocksStmt(ctx, "k0", 1, dim, start, end); } kgenBeginBranch(ctx, "while (block >= k0)"); sprintf(tmp, "block -= k0;\n" "coord.x += %lu;\n", dim[0].x); kgenAddStmt(ctx, tmp); genPanelBlocksStmt(ctx, "k0", 1, dim, start, end); kgenEndBranch(ctx, NULL); kgenAddStmt(ctx, "coord.x += startN;\n"); if (!isEvalOnlyDiag) { kgenEndBranch(ctx, NULL); } if (!isDiagSep && (kflags & KEXTRA_TAILS_M)) { sprintf(tmp, "coord.y = (%s >= startN + N %% %lu) ? " "(N - (block + 1) * %lu) : " "(N - N %% %lu - block * %lu);\n", start, dim[0].y, dim[0].y, dim[0].y, dim[0].y); } else if ((isDiagSep && !isEvalOnlyDiag) && (kflags & KEXTRA_TAILS_M)) { sprintf(tmp, "coord.y = (N - N %% %lu - block * %lu);\n", dim[0].y, dim[0].y); } else { sprintf(tmp, "coord.y = %s - (block + 1) * %lu;\n", end, dim[0].y); } kgenAddStmt(ctx, tmp); if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A) && (vecAlign > 1)) { sprintf(tmp, "coord.y = (coord.y + %u) / %u * %u;\n", vecAlign - 1, vecAlign, vecAlign); kgenAddStmt(ctx, tmp); } if ( subgMode ) { kgenPrintf( ctx, "coord.y += startN + %s.y * %lu;\n", pSubgVNames->subgCoord, dim[1].y ); kgenPrintf( ctx, "coord.x += %s.x * %lu;\n", pSubgVNames->subgCoord, dim[1].x ); kgenBeginBranch( ctx, "if (coord.y >= startN + argN || coord.x >= startN + argN)" ); kgenPrintf( ctx, "skipTilemul = 1;\n" ); kgenEndBranch( ctx, NULL ); sprintf( tmp, "if (coord.x >= coord.y + %lu)", dim[1].y ); kgenBeginBranch( ctx, tmp ); kgenPrintf( ctx, "skipTilemul = 1;\n" ); kgenEndBranch( ctx, NULL ); } else { sprintf(tmp, "coord.y += startN + lid %% %u * %lu;\n", l1Pans, dim[1].y); kgenAddStmt(ctx, tmp); sprintf(tmp, "coord.x += lid / %u * %lu;\n" "if (coord.y >= startN + N || coord.x >= startN + N) {\n" " return;\n" "}\n" // check if the tile is fully out of the diagonal "if (coord.x >= coord.y + %lu) {\n" " return;\n" "}\n\n", l1Pans, dim[1].x, dim[1].y); kgenAddStmt(ctx, tmp); } } kgenAddBlankLine(ctx); } //----------------------------------------------------------------------------- static void declareComplexMults( struct KgenContext *ctx, DataType dtype, UpdateResultFlags uflags) { const char *tname; if (isComplexType(dtype)) { tname = dtypeBuiltinType(dtype); declareComplexMultParts(ctx, "alpha", tname); if (uflags & UPRES_WITH_BETA) { declareComplexMultParts(ctx, "beta", tname); } } } //----------------------------------------------------------------------------- static void genUpdateSingleOptimized( struct StatementBatch *batch, const BlasGenSettings *gset, const Kstring *tempC, const Kstring *result, const Kstring *complexOpTmp) { const char *alphaName; const char *betaName; bool useMad; const CLBLASKernExtra *kextra = gset->kextra; alphaName = gset->varNames.alpha; betaName = (kextra->flags & KEXTRA_BETA_ZERO) ? NULL : gset->varNames.beta; useMad = (kextra->flags & KEXTRA_ENABLE_MAD) != 0; if (isComplexType(kextra->dtype)) { TileMulCore core; Kstring expr; Kstring alphaStr; const Kstring *k3; bool isDouble; isDouble = isDoubleBasedType(kextra->dtype); core = (useMad) ? TILEMUL_MAD : TILEMUL_MULADD; kstrcpy(&alphaStr, alphaName); k3 = ((betaName != NULL) && (core == TILEMUL_MAD) && complexOpTmp) ? 
complexOpTmp : tempC; if (betaName != NULL) { Kstring betaStr; kstrcpy(&betaStr, betaName); sprintfComplexMulUpdate(&expr, k3, tempC, &betaStr, NULL, isDouble, false, false, core); kgenAddStmtToBatch(batch, MAD_STMT_PRIO, expr.buf); sprintfComplexMulUpdate(&expr, tempC, result, &alphaStr, k3, isDouble, false, false, core); kgenAddStmtToBatch(batch, MAD_STMT_PRIO, expr.buf); } else { //fix correctness bug for c/z syr2k when beta = (0,0) sprintfComplexMulUpdate_syr2k_beta0(&expr, tempC, result, &alphaStr, NULL, isDouble, false, false, core); kgenAddStmtToBatch(batch, MAD_STMT_PRIO, expr.buf); } } else { if (betaName != NULL) { if (useMad) { kgenBatchPrintf(batch, MAD_STMT_PRIO, "%s = mad(%s, %s, 0);\n" "%s = mad(%s, %s, %s);\n", tempC->buf, tempC->buf, betaName, tempC->buf, result->buf, alphaName, tempC->buf); } else { kgenBatchPrintf(batch, MAD_STMT_PRIO, "%s = %s * %s + %s * %s;\n", tempC->buf, result->buf, alphaName, tempC->buf, betaName); } } else { if (useMad) { kgenBatchPrintf(batch, MAD_STMT_PRIO, "%s = mad(%s, %s, 0);\n", tempC->buf, result->buf, alphaName); } else { kgenBatchPrintf(batch, MAD_STMT_PRIO, "%s = %s * %s;\n", tempC->buf, result->buf, alphaName); } } } } //----------------------------------------------------------------------------- // Init temporary file for diagonal result update static void initTmpResTile(Tile *tile, const BlasGenSettings *gset, bool forceNoTrans) { KernelExtraFlags kflags = gset->kextra->flags; bool cmaj = ((kflags & KEXTRA_COLUMN_MAJOR) != 0) && !forceNoTrans; const Tile *tc = &gset->tileCY; memcpy(tile, tc, sizeof(Tile)); if (!(kflags & KEXTRA_BETA_ZERO)) { unsigned int maxTmpSize; unsigned int pitch; maxTmpSize = tileStorageSize(&gset->tileA) + tileStorageSize(&gset->tileBX); tile->baseName = "tempC"; tile->vecLen = getVecLen(gset, CLBLAS_SYRK, MATRIX_C); tile->trans = cmaj; pitch = (cmaj) ? tile->nrRows : tile->nrCols; tile->vecLen = (unsigned int)roundDownPow2(pitch); tile->vecLen = umin(tile->vecLen, MAX_TILE_VECLEN); /* * restrict number of rows or columns of the new tile according * to the maximum tile size evaluated above */ if (cmaj) { pitch = (unsigned int)roundUp(tile->nrRows, tile->vecLen); tile->nrCols = umin(maxTmpSize / pitch, tile->nrCols); tile->nrCols = (unsigned int)roundDownPow2(tile->nrCols); } else { pitch = (unsigned int)roundUp(tile->nrCols, tile->vecLen); tile->nrRows = umin(maxTmpSize / pitch, tile->nrRows); tile->nrRows = (unsigned int)roundDownPow2(tile->nrRows); } } } //----------------------------------------------------------------------------- // Declare and setup pointer to the start of updated outpu tile const char *declareSetupOutputPtr(struct KgenContext *ctx, const BlasGenSettings *gset) { const KernelVarNames *kvars = &gset->varNames; const char *coords[2] = {kvars->coordA, kvars->coordB}; const char *tname; int cmaj; tname = dtypeBuiltinType(gset->kextra->dtype); cmaj = ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0); kgenPrintf(ctx, "__global %s *dst = %s + %s * %s + %s;\n\n", tname, kvars->C, coords[cmaj], kvars->ldc, coords[1 - cmaj]); return "dst"; } //----------------------------------------------------------------------------- /* * Check if an additional temporary variable is need for updating complex * result. It is needed if using "mad" buit-in OpenCL functions because * a single operation is evaluated with 2 statements. Without that the result * part evaluated with the first statement is used as an input argument * in the second one that leads to wrong evaluation. 
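 * As an illustration, a complex multiply emitted with mad expands into
 * roughly this two-statement shape (the exact form is produced by
 * sprintfComplexMulUpdate):
 *
 *     dst.x = mad(a.x, b.x, -(a.y * b.y));
 *     dst.y = mad(a.x, b.y,   a.y * b.x);
 *
 * When dst aliases an operand, as in c = c * beta, the first statement
 * clobbers a component the second one still reads; routing the first
 * product through the temporary avoids the aliasing.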
Declare and put its * name to the passed string if it's really needed or just empty the * string otherwise */ static void checkDeclareUpcomTmp( struct KgenContext *ctx, const BlasGenSettings *gset, Kstring *kstr) { DataType dtype = gset->kextra->dtype; const char *tname; if (isComplexType(dtype) && (gset->kextra->flags & KEXTRA_ENABLE_MAD)) { tname = dtypeBuiltinType(dtype); kgenPrintf(ctx, "%s sctmp;\n", tname); kstrcpy(kstr, "sctmp"); } else { emptyKstring(kstr); } } //----------------------------------------------------------------------------- // Declare set of variables differing with trailing index static void declareDiagUpresIndexedVars( struct KgenContext *ctx, const char *type, const char *baseName, unsigned int nrVars) { Kstring kstr; unsigned int i; ksprintf(&kstr, "%s %s0", type, baseName); for (i = 1; i < nrVars; i++) { kstrcatf(&kstr, ", %s%u", baseName, i); } kstrcatf(&kstr, ";\n"); kgenAddStmt(ctx, kstr.buf); } //----------------------------------------------------------------------------- /* * Add blank line for each diagonal update statement priority * to make the code more readable */ static void addDiagUpdateBlanks(struct StatementBatch *batch) { kgenAddStmtToBatch(batch, FETCH_STMT_PRIO, "\n"); kgenAddStmtToBatch(batch, MAD_STMT_PRIO, "\n"); kgenAddStmtToBatch(batch, STORE_STMT_PRIO, "\n"); } //----------------------------------------------------------------------------- /* * The function update result around the diagonal in case of * 'y' and 'x' subdimensions equal at the tile level, and not * having tails along those subdimensions. */ static int genUpdateIsoscelesDiagTile( struct KgenContext *ctx, const BlasGenSettings *gset) { KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; struct StatementBatch *batch; PhysTileIterator iter; unsigned int vlen; const Tile *tileC = &gset->tileCY; Tile tileTempC; bool isPhysUpper; bool isHit; bool withBeta; bool cmaj; unsigned int nrStored; unsigned int skipCnt = 0; const char *glbType; const char *dstPtr; Kstring tempElem, resElem; Kstring k1, k2, ldcName; Kstring comTmp; const Kstring *ptmp; Kstring offExpr; unsigned int tempRows, tempCols; unsigned int madLen; batch = createStmtBatch(); if (batch == NULL) { return -ENOMEM; } cmaj = (kflags & KEXTRA_COLUMN_MAJOR) != 0; isPhysUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ cmaj; withBeta = !(kflags & KEXTRA_BETA_ZERO); iterInit(&iter, tileC, 1, 0); vlen = getVecLen(gset, CLBLAS_SYRK, MATRIX_C); kstrcpy(&ldcName, gset->varNames.ldc); initTmpResTile(&tileTempC, gset, false); tempRows = tileTempC.nrRows; tempCols = tileTempC.nrCols; // declare and initialize needed variables dstPtr = declareSetupOutputPtr(ctx, gset); checkDeclareUpcomTmp(ctx, gset, &comTmp); ptmp = (isKstringEmpty(&comTmp)) ? NULL : &comTmp; if (tileTempC.baseName != tileC->baseName) { declareOneTileStorage(ctx, &tileTempC); kgenAddBlankLine(ctx); } while (!iterIsEnd(&iter)) { if (!(iter.row % tempRows || iter.col % tempCols)) { addDiagUpdateBlanks(batch); flushStmtBatch(ctx, batch); } isHit = (isPhysUpper) ? (iter.vec >= iter.line) : (iter.line >= iter.vec); skipCnt = (skipCnt) ? (skipCnt - 1) : 0; if (!isHit) { iterIterate(&iter); continue; } if (skipCnt) { nrStored = 0; } else if (isPhysUpper) { if (iter.vec && !isRoundedPow2(iter.vec)) { size_t s = iter.vec; s = szmin(roundUpPow2(s) - s, s - roundDownPow2(s)); nrStored = (unsigned int)s; } else { nrStored = (iter.vec) ? 
umin(iter.vec, iter.nrVecs - iter.vec) : (unsigned int)iter.nrVecs; } } else { nrStored = (unsigned int)roundDownPow2(iter.line - iter.vec + 1); } nrStored = umin(nrStored, vlen); skipCnt = umax(skipCnt, nrStored); if (nrStored) { getVectorTypeName(dtype, nrStored, &glbType, NULL); ksprintf(&k1, "%u", iter.line); ksprintf(&k2, "%u", iter.vec); sprintfFastScalarMad(&offExpr, &k1, &ldcName, 0, &k2); if (withBeta) { sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows, iter.col % tempCols, nrStored); kgenBatchPrintf(batch, FETCH_STMT_PRIO, "%s = *(__global %s*)(&%s[%s]);\n", tempElem.buf, glbType, dstPtr, offExpr.buf); } } madLen = (isComplexType(dtype) || (tileC->trans != cmaj)) ? 1 : nrStored; if (madLen) { sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows, iter.col % tempCols, madLen); sprintfTileElement(&resElem, tileC, iter.row, iter.col, madLen); genUpdateSingleOptimized(batch, gset, &tempElem, &resElem, ptmp); } if (nrStored) { sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows, iter.col % tempCols, nrStored); kgenBatchPrintf(batch, STORE_STMT_PRIO, "*(__global %s*)(&%s[%s]) = %s;\n", glbType, dstPtr, offExpr.buf, tempElem.buf); } iterIterate(&iter); } addDiagUpdateBlanks(batch); flushStmtBatch(ctx, batch); destroyStmtBatch(batch); return 0; } //----------------------------------------------------------------------------- /* * Update diagonal tile of arbitrary shape in case of not having tails * along 'x' and 'y' subdimensions at the tile level. */ static int genUpdateGenericDiagTile( struct KgenContext *ctx, const BlasGenSettings *gset) { KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; const char *typeName; struct StatementBatch *batch; PhysTileIterator iter; TileIterFlags tifl; BlasGenSettings gsetNew; const Tile *tileC = &gset->tileCY; Tile tileTempC; bool withBeta; bool isUpper; const char *dstPtr; const char *s; Kstring tempElem, resElem; Kstring comTmp; const Kstring *ptmp; Kstring kstr, alphaStr, betaStr; unsigned int nrRows, nrCols; unsigned int tempRows; // type of the vectorized coordinates Kstring vctype; Kstring constOffs, constShifts, constMasks; unsigned int i, j, nops,size; unsigned int maxFetches = 0; const char *yname, *xname; const char *ldcName; char hexadec[2]; batch = createStmtBatch(); if (batch == NULL) { return -ENOMEM; } typeName = dtypeBuiltinType(dtype); nrRows = tileC->nrRows; nrCols = tileC->nrCols; withBeta = !(kflags & KEXTRA_BETA_ZERO); isUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0); yname = gset->varNames.coordA; xname = gset->varNames.coordB; ldcName = gset->varNames.ldc; memcpy(&gsetNew, gset, sizeof(BlasGenSettings)); /* * Fetches are done by single element. Non transposed shape * is forced to facilitate further size restriction and tile * manipulation */ memcpy(&tileTempC, tileC, sizeof(Tile)); tileTempC.trans = false; tifl = (isUpper) ? TILE_ITER_BACKWARD_ROWS : TILE_ITER_BACKWARD_COLS; iterInit(&iter, &tileTempC, 1, tifl); nops = 0; while (!iterIsEnd(&iter)) { nops++; size = nops / nrCols; iterIterate(&iter); } iterInit(&iter, &tileTempC, 1, tifl); initTmpResTile(&tileTempC, gset, true); if (nrCols == 1) { kstrcpy(&vctype, "uint"); } else { ksprintf(&vctype, "uint%u", nrCols); } /* * fill constant offsets, shifts and masks within each line * for vectorized coorinates */ ksprintf(&constOffs, "(%s)(", vctype.buf); ksprintf(&constShifts, "(%s)(", vctype.buf); ksprintf(&constMasks, "(%s)(", vctype.buf); for (i = 0; i < nrCols; i++) { s = (i == nrCols - 1) ? 
"" : ", "; j = (isUpper) ? (nrCols - i - 1) : i; kstrcatf(&constOffs, "%uu%s", j, s); kstrcatf(&constShifts, "%uu%s", i, s); kstrcatf(&constMasks, "%#x%s", 1 << i, s); } kstrcatf(&constOffs, ")"); kstrcatf(&constShifts, ")"); kstrcatf(&constMasks, ")"); // declare and initialize needed variables dstPtr = declareSetupOutputPtr(ctx, gset); checkDeclareUpcomTmp(ctx, gset, &comTmp); ptmp = (isKstringEmpty(&comTmp)) ? NULL : &comTmp; if (tileTempC.baseName != tileC->baseName) { /* * Make additional temporary tile size restrition because of the * following factors: * * No more than 16 fetches can be combined into single clause. * So, there is no need to maintain larger temporary tile as well * as more vector coordinates to reduce number of consumed registers. * However, actually, the compiler pains even 16 fetches merged into * single clause and allocate huge number of registers. */ if (tileStorageSize(&tileTempC) > MAX_FETCH_CLAUSE_SIZE) { tileTempC.nrRows = (unsigned int)roundDownPow2( MAX_FETCH_CLAUSE_SIZE / nrCols); if (!tileTempC.nrRows) { tileTempC.nrRows = 1; } } } tempRows = tileTempC.nrRows; maxFetches = MAX_FETCH_CLAUSE_SIZE / nrCols * nrCols; maxFetches = umin(maxFetches, tempRows * nrCols); i = tileStorageSize(&tileTempC); maxFetches = umin(maxFetches, i); // declare vectorized coordinates declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", size); /* * real y coordinate, offset mask and * substituted beta and alpha (one value per temporary line) */ kgenAddStmt(ctx, "unsigned int ry;\n" "unsigned int mask;\n" "int hit;\n"); if (withBeta) { declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", size); declareDiagUpresIndexedVars(ctx, typeName, "betaNew", size); } // declare tile if (tileTempC.baseName != gset->tileCY.baseName) { declareOneTileStorage(ctx, &tileTempC); kgenAddBlankLine(ctx); } // set start mask value if (isUpper) { kgenPrintf(ctx, "if (%s + %u <= %s) {\n" " mask = ~0;\n" "}\n" "else {\n" " mask = (%s + %u < %s + %u) " " ? ~((1 << (%s + %u - %s)) - 1) : 0;\n" "}\n\n", yname, nrRows - 1, xname, yname, nrRows - 1, xname, nrCols - 1, yname, nrRows, xname); } else { kgenPrintf(ctx, "if (%s + %u <= %s) {\n" " mask = ~0;\n" "}\n" "else {\n" " mask = (%s > %s) ? ((1 << (%s - %s)) - 1) : 0;\n" "}\n\n", xname, nrCols - 1, yname, yname, xname, yname, xname); } // let's go nops = 0; while (!iterIsEnd(&iter)) { if (nops == maxFetches) { addDiagUpdateBlanks(batch); flushStmtBatch(ctx, batch); nops = 0; } // index for all temporary coordinates i = nops / nrCols; // prepare vectorized coordinates for the next line if (nops % tileTempC.nrCols == 0) { if (isUpper) { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "hit = (%s + %u <= %s + %u);\n", yname, iter.row, xname, nrCols - 1); } else { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "hit = (%s + %u >= %s);\n", yname, iter.row, xname); } if (withBeta) { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "betaNew%u = (hit) ? %s : %s;\n" "alphaNew%u = (hit) ? 
%s : (%s)0;\n", i, gset->varNames.beta, strOne(dtype), i, gset->varNames.alpha, typeName); } if (isUpper) { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "ry = select(0, %u, hit);\n" "mask = select(mask, mask >> 1 | %#x, hit);\n" "cc%u = ((%s)mask &\n" " %s) >>\n" " %s;\n" "cc%u = %u - mad24(cc%u, %s, 0u);\n", iter.row, (1 << (nrCols - 1)), i, vctype.buf, constMasks.buf, constShifts.buf, i, nrCols - 1, i, constOffs.buf); } else { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "ry = select(%u, %u, hit);\n" "mask = select(mask, mask << 1 | 1, hit);\n" "cc%u = ((%s)mask &\n" " %s) >>\n" " %s;\n" "cc%u = mad24(cc%u, %s, 0u);\n", nrRows - 1, iter.row, i, vctype.buf, constMasks.buf, constShifts.buf, i, i, constOffs.buf); } if (kflags & KEXTRA_COLUMN_MAJOR) { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "cc%u = mad24(cc%u, (%s)%s, (%s)ry);\n\n", i, i, vctype.buf, ldcName, vctype.buf); } else { kgenBatchPrintf(batch, CALC_COORDS_STMT_PRIO, "cc%u = mad24((%s)ry, (%s)%s, cc%u);\n\n", i, vctype.buf, vctype.buf, ldcName, i); } } // prepare for the immediate update sprintfTileElement(&tempElem, &tileTempC, iter.row % tempRows, iter.col, 1); sprintfTileElement(&resElem, tileC, iter.row, iter.col, 1); if (nrCols == 1) { ksprintf(&kstr, "cc%u", i); } else { snprintf(hexadec, sizeof(char)*2, "%x", iter.col); //itoa(iter.col, hexadec, 16); ksprintf(&kstr, "cc%u.s%s", i, hexadec); } // prepare multipliers and fetch if (withBeta) { ksprintf(&alphaStr, "alphaNew%u", i); ksprintf(&betaStr, "betaNew%u", i); gsetNew.varNames.alpha = alphaStr.buf; gsetNew.varNames.beta = betaStr.buf; kgenBatchPrintf(batch, FETCH_STMT_PRIO, "%s = %s[%s];\n", tempElem.buf, dstPtr, kstr.buf); } genUpdateSingleOptimized(batch, &gsetNew, &tempElem, &resElem, ptmp); // store kgenBatchPrintf(batch, STORE_STMT_PRIO, "%s[%s] = %s;\n", dstPtr, kstr.buf, tempElem.buf); nops++; iterIterate(&iter); } addDiagUpdateBlanks(batch); flushStmtBatch(ctx, batch); destroyStmtBatch(batch); return 0; } //----------------------------------------------------------------------------- static int genUpdateTailedDiagTile( struct KgenContext *ctx, const BlasGenSettings *gset, UpdateResultFlags uflags) { char tmp[1024]; char s1[1024], s2[256]; char src[32], dst[32]; char *p; const char *vfield; size_t pitch; struct KgenContext *ctx1; const CLBLASKernExtra *kextra = gset->kextra; DataType dtype = kextra->dtype; KernelExtraFlags kflags = kextra->flags; const SubproblemDim *dims = gset->subdims; UpdateResultOp op; /* * solution tile coordinate without consideration of * row/column order */ const char *trow, *tcol, *s3, *s4; vfield = dtypeUPtrField(dtype); pitch = roundUp(gset->tileCY.nrCols, gset->tileCY.vecLen); tcol = gset->varNames.coordB; trow = gset->varNames.coordA; s3 = (kflags & KEXTRA_COLUMN_MAJOR) ? tcol : trow; s4 = (kflags & KEXTRA_COLUMN_MAJOR) ? 
trow : tcol; // declare and initialize variables sprintf(s1, "uint m = min(%luu, N - %s);\n" "uint n = min(%luu, N - %s);\n", dims[1].y, trow, dims[1].x, tcol); p = s1 + strlen(s1); sprintf(p, "uint i, j, j0;\n" "PPtr res;\n" "GPtr uC;\n" "\n" "res.%s = c;\n" "uC.%s = C + %s * ldc + %s;\n", vfield, vfield, s3, s4); if (uflags & (UPRES_TAIL_ROW | UPRES_TAIL_COL)) { char offStr[64]; char *p = offStr; offStr[0] = '\0'; if (uflags & UPRES_TAIL_ROW) { sprintf(offStr, " + (%lu - m) * %lu", dims[1].y, pitch); p += strlen(offStr); } if (uflags & UPRES_TAIL_COL) { sprintf(p, " + (%lu - n)", dims[1].x); } p = s1 + strlen(s1); sprintf(p, "res.%s = res.%s%s;\n", vfield, vfield, offStr); } kgenAddBlankLine(ctx); ctx1 = createKgenContext(s2, sizeof(s2), true); if (ctx1 == NULL) { return -ENOMEM; } kgenSyncFormatting(ctx1, ctx, 1); // update logic sprintf(src, "res.%s[i * %lu + j]", vfield, pitch); if (uflags & UPRES_COLUMN_MAJOR) { sprintf(dst, "uC.%s[j * ldc + i]", vfield); } else { sprintf(dst, "uC.%s[i * ldc + j]", vfield); } op = (kflags & KEXTRA_BETA_ZERO) ? UPRES_SET : UPRES_SUM; genUpdateResultSingle(ctx1, dst, src, gset, op, uflags); if ( kflags & KEXTRA_UPPER_TRIANG ) { declareComplexMults(ctx, dtype, uflags); sprintf(tmp, "%s" // variables /* * setup number of rows to update * and start column to update from */ "j = min(%s + %lu, %s + %lu) - %s;\n" "m = min(m, j);\n" "j0 = (%s < %s) ? (%s - %s) : 0;\n" "\n" "for (i = 0; i < m; i++) {\n" " for (j = j0; j < n; j++) {\n" "%s" // update logic " }\n" /* * increment row, increment start column * if the diagonal is reached */ " %s++;\n" " j0 = (%s >= %s) ? j0 : (j0 + 1);\n" "}\n", s1, trow, dims[1].y, tcol, dims[1].x, trow, tcol, trow, trow, tcol, s2, trow, tcol, trow); } else { declareComplexMults(ctx, dtype, uflags); sprintf(tmp, "uint i0;\n" "%s" // variables "i0 = (%s < %s) ? (%s - %s) : 0;\n" "j = min(%s + %lu, %s + %lu) - %s;\n" "n = min(j, n);\n" "j0 = (%s < %s) ? 
(%s - %s + 1) : 1;\n" "\n" "for (i = i0; i < m; i++) {\n" " for (j = 0; j < j0; j++) {\n" "%s" // update logic " }\n" " j0 = min(j0 + 1, n);\n" "}\n", s1, trow, tcol, tcol, trow, trow, dims[1].y, tcol, dims[1].x, tcol, tcol, trow, trow, tcol, s2); } destroyKgenContext(ctx1); return kgenAddStmt(ctx, tmp); } //----------------------------------------------------------------------------- static int genUpdateResult( struct KgenContext *ctx, BlasFunctionID funcID, BlasGenSettings *gset, UpdateResultFlags upresFlags, const char * d1, // dummy parameters for compatibility with callback ptr const char * d2, const char * d3) { KernelExtraFlags kflags = gset->kextra->flags; KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL | KEXTRA_SYRK_EVALUATE_DIAGONAL; int ret; char tmp[1024]; DUMMY_ARGS_USAGE_3(d1, d2, d3); if ( gset->kextra->flags & KEXTRA_UPPER_TRIANG ) { sprintf( tmp, "if ( !( (coord.y >= startN + argN) || " "(coord.x >= origN) || " "(coord.y >= coord.x + %lu) ) )", gset->subdims[1].x ); kgenBeginBranch( ctx, tmp ); } else { sprintf( tmp, "if ( !( (coord.y >= startN + argN) || " "(coord.x >= startN + argN) || " "(coord.x >= coord.y + %lu) ) )", gset->subdims[1].y ); kgenBeginBranch( ctx, tmp ); } // update diagonal if the chosen mode implies its processing if ((kflags & diagFlags) != KEXTRA_SYRK_SEPARATE_DIAGONAL) { const char *tcol = gset->varNames.coordB; const char *trow = gset->varNames.coordA; bool areTails; areTails = ((kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER)) != 0); if (areTails || (gset->subdims[1].y == gset->subdims[1].x)) { if ( kflags & KEXTRA_UPPER_TRIANG ) { sprintf(tmp, "if (%s + %lu > %s)", trow, gset->subdims[1].y, tcol); } else { sprintf(tmp, "if (%s + %lu > %s)", tcol, gset->subdims[1].x, trow); } kgenBeginBranch(ctx, tmp); if (!areTails) { ret = genUpdateIsoscelesDiagTile(ctx, gset); } else { ret = genUpdateTailedDiagTile(ctx, gset, upresFlags); } } else { unsigned int xb, yb; xb = (unsigned int)gset->subdims[0].x; yb = (unsigned int)gset->subdims[0].y; if ( kflags & KEXTRA_UPPER_TRIANG ) { sprintf(tmp, "if (%s / %u * %u + %u > %s / %u * %u)", trow, yb, yb, yb - 1, tcol, xb, xb); } else { sprintf(tmp, "if (%s / %u * %u + %u > %s / %u * %u)", tcol, xb, xb, xb - 1, trow, yb, yb); } kgenBeginBranch(ctx, tmp); ret = genUpdateGenericDiagTile(ctx, gset); } if (ret) { return ret; } kgenEndBranch(ctx, NULL); // the function above put a respective code into a conditional path kgenBeginBranch(ctx, "else"); } ret = genResultUpdateWithFlags( ctx, funcID, gset, upresFlags, NULL, NULL, NULL ); if ((kflags & diagFlags) != KEXTRA_SYRK_SEPARATE_DIAGONAL) { ret = kgenEndBranch(ctx, NULL); } kgenEndBranch( ctx, NULL ); return ret; } //----------------------------------------------------------------------------- static void initGenSettings( BlasGenSettings *gset, const SubproblemDim *subdims, const PGranularity *pgran, const CLBLASKernExtra *kextra, BlasFunctionID funcID) { KernelVarNames *vnames = &gset->varNames; unsigned int vecLen; memset(gset, 0, sizeof(BlasGenSettings)); memcpy(gset->subdims, subdims, sizeof(gset->subdims)); gset->flags = BGF_LD_IN_VECTORS; if ((funcID == CLBLAS_SYR2K) && !(kextra->flags & KEXTRA_SYRK_2K_RANK)) { gset->flags |= BGF_DISTINCT_VECLEN; } gset->pgran = pgran; gset->kextra = kextra; // !!! 
WORKAROUND; some cases fails with fetched fully tile of A vecLen = getVecLen(gset, funcID, MATRIX_A); if (vecLen != 1) { gset->flags |= BGF_WHOLE_A; } /////////////////////////////////////////////////////////////////////// if ((funcID == CLBLAS_SYR2K) && kextra->flags & KEXTRA_SYRK_2K_RANK) { vnames->A = "wiA"; vnames->B = "wiB"; } else { vnames->A = "A"; vnames->B = "B"; } vnames->C = "C"; vnames->lda = "lda"; vnames->ldb = (funcID == CLBLAS_SYR2K) ? "ldb" : vnames->lda; vnames->alpha = "alpha"; if (!(kextra->flags & KEXTRA_BETA_ZERO)) { vnames->beta = "beta"; } vnames->coordA = "coord.y"; vnames->coordB = "coord.x"; vnames->k = "coord.z"; vnames->sizeM = "N"; vnames->sizeN = "N"; vnames->sizeK = "K"; vnames->skewA = NULL; vnames->skewB = NULL; vnames->skewK = NULL; } //----------------------------------------------------------------------------- static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra, BlasFunctionID funcID) { ssize_t ret; struct KgenContext *ctx; char tmp[1024]; CLBLASKernExtra kextraNew; TileCreationFlags tcflags; DataType dtype; KernelExtraFlags kflags; UpdateResultFlags uflags; BlasGenSettings gset; TileMulOpts mulOpts; KernelVarNames *vnames = &gset.varNames; int i, numRanks; TilePostFetchPrivate pfPriv; TailStatus tailStatus = 0; FetchAddrMode addrMode; SyrxkExtraPriv *priv; bool subgMode = 0; SubgVarNames subgVNames; bool areTailsMN; memcpy(&kextraNew, extra, sizeof(kextraNew)); subgMode = ( subdims[0].bwidth != subdims[1].bwidth ); // fixup tail flags in respect with the selected separate diagonal mode kflags = kextraNew.flags; if (kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) { bool isUpper = ((kflags & KEXTRA_UPPER_TRIANG) != 0); if ((kflags & (KEXTRA_SYRK_SEPARATE_DIAGONAL | KEXTRA_SYRK_EVALUATE_DIAGONAL)) == KEXTRA_SYRK_SEPARATE_DIAGONAL) { if (isUpper) { kflags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER); } else { kflags &= ~(KEXTRA_TAILS_N | KEXTRA_TAILS_N_LOWER); } } kextraNew.flags = kflags; } dtype = kextraNew.dtype; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { return -ENOMEM; } kgenDeclareUptrs(ctx, isDoubleBasedType(dtype)); initGenSettings(&gset, subdims, pgran, &kextraNew, funcID); /* * fixup vectorization for C if some restrictions for it has been set * during the generic solve stage */ priv = (SyrxkExtraPriv*)&kextraNew.solverPriv; if (priv->maxVlenC) { kextraNew.vecLenC = umin(kextraNew.vecLenC, priv->maxVlenC); if (!(gset.flags & BGF_DISTINCT_VECLEN)) { kextraNew.vecLen = umin(kextraNew.vecLenC, kextraNew.vecLen); } } mulOpts.memA = mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.core = (kflags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.postFetch = NULL; mulOpts.flags = TILEMUL_NO_FLAGS; if (isMatrixAccessColMaj(funcID, kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_TRA; } else { mulOpts.flags |= TILEMUL_TRB; } mulOpts.fctx = createFetchContext(); if (mulOpts.fctx == NULL) { destroyKgenContext(ctx); return -ENOMEM; } if (kflags & KEXTRA_TAILS_K_LOWER) { // setup post fetch callback memset(&pfPriv, 0, sizeof(pfPriv)); pfPriv.wholeA = 1; pfPriv.funcID = funcID; pfPriv.gset = &gset; mulOpts.postFetch = defaultTilePostFetch; mulOpts.postFetchPriv = &pfPriv; } if( subgMode ) { declareKernel( ctx, &gset, funcID, "Subg" ); } else { declareKernel( ctx, &gset, funcID, "Block" ); } kgenBeginFuncBody(ctx); areTailsMN = (kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER)) != 0; tcflags = areTailsMN ? 
TILE_C_FORCE_NOTRANS : 0; initDefaultTiles(&gset, funcID, tcflags, PRIV_STORAGE_VARIABLE_SET); /* * FIXME: since now it is used PPtr for updating diagonal * in case of tails variables cannot be used */ if (areTailsMN) { gset.tileCY.storType = PRIV_STORAGE_ARRAY; } declareTileStorages(ctx, &gset); genHead( ctx, &gset, funcID, &subgVNames, subgMode ); genZeroTile(ctx, &gset.tileCY); /* For adjusting coordinates, skews and updating result */ kgenAddStmt(ctx, "// Set N to initial argument of blas function, not divided one\n" "N = origN;\n"); if ( kflags & KEXTRA_UPPER_TRIANG ) { tailStatus = checkGenAdjustTailCoords(ctx, funcID, &gset, NULL); kgenAddBlankLine(ctx); } // generate multiplication logic numRanks = (kflags & KEXTRA_SYRK_2K_RANK) ? 2 : 1; addrMode = setDefaultFetchAddrMode(mulOpts.fctx, &gset, 0, tailStatus, (kflags & KEXTRA_TAILS_K_LOWER) != 0); genScaleLeadingDimensions(ctx, &gset); // ldc should not be scaled, so it is initialized after that gset.varNames.ldc = "ldc"; // Begin loop over the small panel for (i = 0; i < numRanks; i++) { if (i) { kgenAddStmt(ctx, "// begin the second rank update\n"); /* * For the second rank, reset coordinates and swap leading * dimensions */ if (!(addrMode & FETCH_ADDR_K_RELATIVE)) { kgenAddStmt(ctx, "coord.z = 0;\n"); } vnames->lda = "ldb"; vnames->ldb = "lda"; } genSetupPointers(ctx, &gset, funcID, addrMode, i); if (i) { kgenBeginBranch(ctx, NULL); } prepareFetchLoop(ctx, mulOpts.fctx, &gset, CLMEM_GLOBAL_MEMORY, CLMEM_GLOBAL_MEMORY); if ( subgMode ) { mulOpts.flags |= TILEMUL_BW_STRIDE; mulOpts.flags |= TILEMUL_NOT_INC_K; mulOpts.postFetch = NULL; setFetchAddrMode(mulOpts.fctx, (addrMode&~FETCH_ADDR_K_RELATIVE)); sprintf( tmp, "if( skipTilemul == 0 )"); kgenBeginBranch( ctx, tmp ); if ( kflags & KEXTRA_TAILS_K_LOWER ) { kgenPrintf( ctx, "uint kBase = K - (K%%%lu);\n", subdims[0].bwidth ); sprintf( tmp, "for ( k0 = %s.x * %lu; k0 < kBase; k0 += %lu )", subgVNames.itemId, subdims[1].bwidth, subdims[0].bwidth ); } else { sprintf( tmp, "for ( k0 = %s.x * %lu; k0 < K; k0 += %lu )", subgVNames.itemId, subdims[1].bwidth, subdims[0].bwidth ); } // main loop branch kgenBeginBranch( ctx, tmp ); gset.varNames.k = "k0"; } else { sprintf(tmp, "for (k0 = 0; k0 < K; k0 += %lu)", subdims[1].bwidth); kgenBeginBranch(ctx, tmp); } pfPriv.fetchNumA = 0; tileMulGen(ctx, &gset, &mulOpts); // main loop branch kgenEndBranch(ctx, NULL); if ( subgMode ) { // lowerK tails for subgroup mode if( kflags & KEXTRA_TAILS_K_LOWER ) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_TAILK_PADD); mulOpts.postFetch = defaultTilePostFetch; mulOpts.flags |= TILEMUL_EXTERN_RDECL; kgenPrintf( ctx, "%s = kBase + %s.x*%lu;\n", vnames->k, subgVNames.itemId, subdims[1].bwidth ); tileMulGen( ctx, &gset, &mulOpts ); } // skipTilemul branch kgenEndBranch( ctx, NULL ); } if (i) { kgenEndBranch(ctx, NULL); } kgenAddBlankLine(ctx); } if ( kflags & KEXTRA_UPPER_TRIANG ) { checkGenRestoreTailCoords(ctx, &gset, tailStatus); } kgenAddBlankLine(ctx); gset.flags &= ~BGF_LD_IN_VECTORS; uflags = kextraToUpresFlags(funcID, kflags); uflags |= tailStatusToUpresFlags(tailStatus); if ( subgMode ) { mergeUpdateResult( ctx, funcID, &gset, &subgVNames, //uflags | UPRES_EXCEED_PROBLEM_CONDITION, uflags, (UpresProcPtr)genUpdateResult ); } else { genUpdateResult( ctx, funcID, &gset, uflags, NULL, NULL, NULL ); } ret = kgenEndFuncBody(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyFetchContext(mulOpts.fctx); destroyKgenContext(ctx); return (ret < 0) ? 
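/*
 * For orientation only: a rough, hand-written sketch of the kernel shape that
 * the generator() routine above builds up through its helper calls. This is
 * not captured generator output; real kernels differ with funcID and the
 * KEXTRA_* flags (tails, upper/lower triangle, separate diagonal, subgroup
 * mode, SYR2K second rank).
 *
 *   __kernel void <name>Block(M, K, alpha, A, lda, ..., C, ldc, offsetM, N, ...)
 *   {
 *       <private tiles a, b, c>;             // declareTileStorages()
 *       <derive coord.x / coord.y>;          // genHead()
 *       c = 0;                               // genZeroTile()
 *       N = origN;
 *       <scale lda/ldb, set A/B base pointers>;
 *       for (k0 = 0; k0 < K; k0 += <subdims[1].bwidth>) {
 *           <fetch tiles, multiply-accumulate into c>;       // tileMulGen()
 *       }
 *       <write c back to C within the requested triangle>;   // genUpdateResult()
 *   }
 */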
-EOVERFLOW : ret; } //----------------------------------------------------------------------------- static void assignKargs( KernelArg *args, const CLBlasKargs *blasArgs, KernelExtraFlags kflags, BlasFunctionID funcID) { int i = 5; // height of the diagonal part initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->K); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); if (funcID == CLBLAS_SYR2K) { initMemobjKarg(&args[i++], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[i++], blasArgs->ldb.matrix); } if (!(kflags & KEXTRA_BETA_ZERO)) { assignScalarKarg(&args[i++], &(blasArgs->beta), blasArgs->dtype); } initMemobjKarg(&args[i++], blasArgs->C, NULL, 0, 0); initSizeKarg(&args[i++], blasArgs->ldc.matrix); initSizeKarg(&args[i++], blasArgs->offsetM); /* Original N */ initSizeKarg(&args[i++], blasArgs->N); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offBX); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { initSizeKarg(&args[i++], blasArgs->offCY); } } //----------------------------------------------------------------------------- static void syrkAssignKargs(KernelArg *args, const void *params, const void *extra) { (void)extra; assignKargs(args, (const CLBlasKargs*)params, ((const CLBLASKernExtra*)extra)->flags, CLBLAS_SYRK); } //----------------------------------------------------------------------------- static void syr2kAssignKargs(KernelArg *args, const void *params, const void *extra) { (void)extra; assignKargs(args, (const CLBlasKargs*)params, ((const CLBLASKernExtra*)extra)->flags, CLBLAS_SYR2K); } //----------------------------------------------------------------------------- static void syrkCalcThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra) { const CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; CLBlasKargs *blasArgs = (CLBlasKargs*)args; size_t nrGroups = 0; size_t x, procX, startN, N, origN, step; bool isU = (blasArgs->uplo == clblasUpper); KernelExtraFlags kflags = ((CLBLASKernExtra*)extra)->flags; KernelExtraFlags diagFlags = KEXTRA_SYRK_SEPARATE_DIAGONAL | KEXTRA_SYRK_EVALUATE_DIAGONAL; bool isDiagSep = ((kflags & KEXTRA_SYRK_SEPARATE_DIAGONAL) != 0); bool isEvalOnlyDiag = ((kflags & diagFlags) == diagFlags); size_t start, end; int roundDir = 1; size_t vecAlign = 1; /* * Traverse the output matrix with panels from * the largest one */ N = blasArgs->M; // width of the diagonal part startN = blasArgs->offsetM; // vertical offset of the diagonal part origN = blasArgs->N; x = (isU) ? N : 0; step = subdims[0].x; /* * NOTE: * * In the case of separate evaluating of the area around the diagonal it's * critically important that at least on step would be aligned. * Otherwise, solution areas will overlap that will lead to a wrong result. */ if (isU && (isDiagSep && !isEvalOnlyDiag)) { roundDir = 0; } else { roundDir = 1; } if (!isU && (!isDiagSep || isEvalOnlyDiag)) { vecAlign = isMatrixAccessColMaj(CLBLAS_SYRK, kflags, MATRIX_A) ? (size_t)umax(kextra->vecLenA, kextra->vecLenB) : 1; } for (procX = 0; procX < N; procX += step) { if (isU) { step = (isU && (x % subdims[0].x)) ? (x % subdims[0].x) : subdims[0].x; start = (!isEvalOnlyDiag) ? 0 : roundDown(x - step, subdims[0].y); end = (!isDiagSep || isEvalOnlyDiag) ? 
x : roundDown(x - step, subdims[0].y); x -= step; } else { start = (!isDiagSep || isEvalOnlyDiag) ? x : roundUp(x + step, subdims[0].y); end = (isEvalOnlyDiag) ? roundUp(x + step, subdims[0].y) : N; end = roundUp(end, vecAlign); x += step; if (start >= end) { continue; } } if (roundDir) { nrGroups += divRoundUp(end - start, subdims[0].y); } else { nrGroups += (end - start) / subdims[0].y; } } /* rectangular part of trapezium */ if (!isEvalOnlyDiag) { if (isU) { nrGroups += divRoundUp(N, subdims[0].y) * divRoundUp(origN - N - startN, subdims[0].x); } else { nrGroups += (startN / subdims[0].x) * divRoundUp(N, subdims[0].y); } } if (nrGroups == 0) { // in case we got N==0 nrGroups = 1; } threads[0] = nrGroups * pgran->wgSize[0]; threads[1] = 1; } //----------------------------------------------------------------------------- static SolverFlags solverFlags(void) { return SF_WSPACE_1D; } //----------------------------------------------------------------------------- static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; const CLBlasKargs *blasArgs = (const CLBlasKargs*)args; size_t moddim; extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv; const size_t nChans = 8; // !!!DEVICE DEPENDED!!! const size_t wideChans = 64; // !!!DEVICE DEPENDED!!! const size_t sizeType[] = {1,2,2,4}; size_t sizeBlock = wideChans * nChans / sizeType[blasArgs->dtype]; size_t off = blasArgs->K % sizeBlock; if (off == 0) { extraData->staggered = roundUp(subdims[1].bwidth * sizeType[blasArgs->dtype] , wideChans / sizeType[blasArgs->dtype]); } else { extraData->staggered = 0; } extraData->staggered = 64 / sizeType[blasArgs->dtype]; //fixed, not calculated /* * Save maxium possible vectorization for C in case of column-major order * and lower triangular matrix C. It is needed because the 'y' problem * dimensions expands in backward direction and aligned access to memory * can occur. */ moddim = (unsigned int)(blasArgs->N % subdims[1].y); if (isMatrixAccessColMaj(CLBLAS_SYRK, kextra->flags, MATRIX_C) && (blasArgs->uplo == clblasLower) && moddim) { SyrxkExtraPriv *priv = (SyrxkExtraPriv*)kextra->solverPriv; size_t tsize; tsize = dtypeSize(kextra->dtype); priv->maxVlenC = appropriateVecLen(blasArgs->N, tsize, subdims[1].y, 3); } } //----------------------------------------------------------------------------- static bool checkCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { bool ret = true; DUMMY_ARG_USAGE(subdimsNum); if (check == PGRAN_CHECK) { unsigned int minSize, maxSize; maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8; minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 
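/*
 * A worked example for syrkCalcThreads() above, with assumed values:
 * lower triangle, diagonal not evaluated separately, M = 96 (width of the
 * diagonal part), offsetM = 0, N = 96, subdims[0].x = subdims[0].y = 32,
 * pgran->wgSize[0] = 64 and vecAlign = 1.
 *
 *   procX =  0: start =  0, end = 96  ->  divRoundUp(96, 32) = 3 groups
 *   procX = 32: start = 32, end = 96  ->  2 groups
 *   procX = 64: start = 64, end = 96  ->  1 group
 *
 * The rectangular part adds (offsetM / 32) * divRoundUp(96, 32) = 0 groups,
 * so nrGroups = 6 (one group per 32x32 tile of the lower triangle) and the
 * launch size becomes threads[0] = 6 * 64 = 384, threads[1] = 1.
 */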
1 : 2; ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true); ret = ret && (subdims[0].bwidth == subdims[1].bwidth); ret = ret && (pgran->wgSize[0] == 64); } else { calcPgranDedicated(pgran, subdims, -1, 3); } return ret; } //----------------------------------------------------------------------------- void initSyr2kBlockPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block syr2k"; mempat->nrLevels = 2; mempat->sops = &syr2kSolverOps; mempat->cuLevel = 0; mempat->thLevel = 1; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- void initSyrkBlockPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based block syrk"; mempat->nrLevels = 2; mempat->sops = &syrkSolverOps; mempat->cuLevel = 0; mempat->thLevel = 1; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- void initSyrkSubgPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based subgroup syrk"; mempat->nrLevels = 2; mempat->sops = &syrkSubgSops; mempat->cuLevel = 0; mempat->thLevel = 1; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- void initSyr2kSubgPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based subgroup syr2k"; mempat->nrLevels = 2; mempat->sops = &syr2kSubgSops; mempat->cuLevel = 0; mempat->thLevel = 1; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mempat->extra = &mpatExtra; } // ---------------------------------------------------------------------------- static int syrkSubgGetPerf( unsigned int kflags, const void *args) { DUMMY_ARG_USAGE(args); if ( !isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_B ) ) { return PPERF_GOOD; } return PPERF_NOT_SUPPORTED; } //----------------------------------------------------------------------------- static int syrkBlockGetPerf( unsigned int kflags, const void *args) { DUMMY_ARG_USAGE(args); if ( !isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_SYRK, kflags, MATRIX_B ) ) { return PPERF_AVERAGE; } return PPERF_GOOD; } //----------------------------------------------------------------------------- static int syrkSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs ) { DUMMY_ARG_USAGE(subdimsNum); pgran->wgDim = 1; return subgGetDefaultDecomp( pgran, subdims, pArgs ); } //----------------------------------------------------------------------------- #if 0 // for debug static int syrkBlockGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum) { // !!! 
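/*
 * Back-of-envelope example for the register-usage heuristic applied in
 * subgCheckCalcDecomp() further below (assumed figures, not measured data,
 * and assuming the other sanity checks pass): with
 * subdims[1] = { x = 4, y = 4, bwidth = 4 } and double precision
 * (8 bytes per element),
 *
 *   regUse = (4*4 + 4*4 + 4*4) * 8 = 384 bytes  ->  384 / 16 = 24 registers,
 *
 * which is below the 64-register cutoff, so the decomposition is kept.
 * With subdims[1] = { 8, 8, 8 } and complex double (16 bytes) the same
 * estimate gives (64 + 64 + 64) * 16 / 16 = 192 registers and the
 * decomposition is rejected.
 */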
DEBUG #if 1 subdims[0].itemX = subdims[0].x = 64; subdims[0].itemY = subdims[0].y = 32; subdims[0].bwidth = subdims[1].bwidth = 2; subdims[1].itemX = subdims[1].x = 8; subdims[1].itemY = subdims[1].y = 4; #else subdims[0].itemX = subdims[0].x = 32; subdims[0].itemY = subdims[0].y = 32; subdims[0].bwidth = subdims[1].bwidth = 4; subdims[1].itemX = subdims[1].x = 4; subdims[1].itemY = subdims[1].y = 4; #endif pgran->wgDim = 1; pgran->wgSize[0] = 64; return 0; ////////////////////////////////////////////////// if( (subdimsNum<2)|| (NULL==pgran)|| (NULL==subdims) ){ return EINVAL; } pgran->wgDim = 1; pgran->wgSize[0] = 64; subdims[1].bwidth = 4; subdims[1].itemX = subdims[1].x = 4; subdims[1].itemY = subdims[1].y = 4; //subdims[0].bwidth = subdims[1].bwidth * itemsPerSubg; subdims[0].bwidth = subdims[1].bwidth; subdims[0].itemX = subdims[0].x = subdims[1].x * 8; subdims[0].itemY = subdims[0].y = subdims[1].y * 8; return 0; } #endif //----------------------------------------------------------------------------- static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { size_t subgA = 0; size_t subgB = 0; size_t regUse = 0; unsigned int itemsPerSubg = 0; DUMMY_ARG_USAGE(subdimsNum); if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } subgA = subdims[0].y/subdims[1].y; subgB = subdims[0].x/subdims[1].x; itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth; if( itemsPerSubg < 4 ){ return false; } if( subdims[1].y < 4 || subdims[1].x < 4 || subdims[1].bwidth < 4 ){ return false; } if( subdims[1].x != subdims[1].itemX || subdims[1].y != subdims[1].itemY ){ return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].x || subdims[0].y % subdims[1].y || subdims[0].bwidth % subdims[1].bwidth ){ return false; } //check fitting of bw to common vector sizes if( isComplexType(dtype) ){ if( 2*subdims[1].bwidth > 16 ){ return false; } } // check dimensions if( subdims[1].bwidth > 16 || subdims[1].x > 16 || subdims[1].y > 16 ){ return false; } // estimate register usage, drop // inevitably slowed decompositions regUse = ( subdims[1].bwidth * subdims[1].x + subdims[1].bwidth * subdims[1].y + subdims[1].x * subdims[1].y ) * dtypeSize(dtype); regUse /= 16; // 16 bytes per register if( regUse >= 64 ){ return false; } // passed PGranularity should be checked if( PGRAN_CHECK == check ){ if( pgran->wgDim != 1 ){ return false; } if( pgran->wgSize[0] != 64 ){ return false; } if( pgran->wgSize[0] != subgA*subgB*itemsPerSubg ){ return false; } } // PGranularity should be calculated else{ pgran->wgDim = 1; pgran->wgSize[0] = subgA * subgB * itemsPerSubg; } return true; } clblas-2.10/src/library/blas/gens/tests/000077500000000000000000000000001264277366700201725ustar00rootroot00000000000000clblas-2.10/src/library/blas/gens/tests/CMakeLists.txt000066400000000000000000000043671264277366700227440ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## set(SRC_TILEMUL ../tilemul.c ../fetch.c ${clBLAS_SOURCE_DIR}/library/common/kerngen_core.c ${clBLAS_SOURCE_DIR}/library/common/kgen_basic.c ${clBLAS_SOURCE_DIR}/library/common/kgen_loop_helper.c ${clBLAS_SOURCE_DIR}/library/common/misc.c ${clBLAS_SOURCE_DIR}/library/blas/gens/blas_kgen.c ${clBLAS_SOURCE_DIR}/library/blas/gens/tile.c ${clBLAS_SOURCE_DIR}/library/blas/gens/tile_iter.c ${clBLAS_SOURCE_DIR}/library/blas/gens/gen_helper.c ${clBLAS_SOURCE_DIR}/library/blas/generic/blas_funcs.c ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_dims.c ${clBLAS_SOURCE_DIR}/library/blas/generic/matrix_props.c ${clBLAS_SOURCE_DIR}/library/common/gens/dblock_kgen.c ${clBLAS_SOURCE_DIR}/library/common/kgen_guard.c ${clBLAS_SOURCE_DIR}/library/common/list.c ${clBLAS_SOURCE_DIR}/library/common/mutex.c ${clBLAS_SOURCE_DIR}/library/common/trace_malloc.c t_tilemul.c ) include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/library/blas/include ${clBLAS_SOURCE_DIR}/library/blas/gens) add_executable(t_tilemul ${SRC_TILEMUL}) target_link_libraries(t_tilemul ${OPENCL_LIBRARIES}) set_target_properties( t_tilemul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) # CPack configuration; include the executable into the package install( TARGETS t_tilemul RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) clblas-2.10/src/library/blas/gens/tests/t_tilemul.c000066400000000000000000001041251264277366700223370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifdef __APPLE__ #include #else #include #endif #include #include #include #include #include #include #include #include #include #define JUST_MULTIPLICATION 0 #if JUST_MULTIPLICATION enum { ITEM_WORK_M = 1, ITEM_WORK_N = 1, ITEM_BLOCKS_K = 1, }; #else enum { ITEM_WORK_M = 4, ITEM_WORK_N = 4, ITEM_BLOCKS_K = 3, RAND_BOUND = 10 }; #endif const char *kernelName = "tilemul_test"; // float types based unified pointer typedef union FPtr { void *v; cl_float *f; cl_double *d; cl_float2 *f2; cl_double2 *d2; } FPtr; // float type based unified data type typedef union FType { unsigned char u[sizeof(cl_double)]; cl_float f; cl_float2 f2; cl_double d; cl_double2 d2; } FType; static void printUsage(const char *programName, int exitCode) { printf( "USAGE: %s [options] \n" " --help, -h Print this help message.\n" " --device, -d OpenCL device used. can " "be \"gpu\" or \"cpu\". Default is \"gpu\".\n" " --type, -t Type can be s, d, c or z. Default " "is s.\n" " --fetch, -f Size of used fetch vectors, in used " "types. Default is 1.\n" " --local, -l If matrix is local or global. Matrix " "can be A or B. By default, both are global.\n" " --verbose, -v Turn on verbose mode.\n" " --a, -a \n" " --b, -b \n Set order for tiles a and b fetching. " "Order can be are \"r\" for row major and \"c\" for " "column major. Default values are \"r\" for A and \"c\" for B.\n" " --skew, -s Set skews for tiles along M, N, and K " "directions. skew_value can be \"a\" for tile A skew along M, \"b\"" " for tile B skew along N and \"k\" for both tiles skew along K. " "There is no skews by default.\n" " -g, --globalcycling \n" " Set global cycling for tiles along M, " "N and K directions. global_cycling_value can be \"a\" for tile A " "global cycling along M, \"b\" for tile B global cycling along N " "and \"k\" for both tiles global cycling along K. There is no " "global cycling enabled by default.\n" " --iter, -i Number of iterations.\n" " --core, -c Multiplier core. can " "be \"muladd\", \"mad\" or \"dot\". Default is \"mad\".\n" " --old, -o Use old tilemul generator interface " "with one generator function call for both fetching and " "multiplication. 
Separate generators functions are used by " "default.\n" " M N K Size of block.\n", programName); exit(exitCode); } void genFillTileWithNAN(struct KgenContext *ctx, const Tile *tile) { char tmp[1024]; Kstring elem; unsigned int incRows, incCols; unsigned int i, j, v; if (!tile->trans) { incRows = 1; v = incCols = umin(tile->vecLen, tile->nrCols); } else { v = incRows = umin(tile->vecLen, tile->nrRows); incCols = 1; } for (i = 0; i < tile->nrRows; i += incRows) { for (j = 0; j < tile->nrCols; j += incCols) { sprintfTileElement(&elem, tile, i, j, v); sprintf(tmp, "%s = NAN;\n", elem.buf); kgenAddStmt(ctx, tmp); } } kgenAddBlankLine(ctx); } void addTestPrefix(struct KgenContext *ctx, bool isDouble) { kgenDeclareUptrs(ctx, isDouble); } static void checkRet(int ret, const char *genName) { if (ret != 0) { printf("%s generator failed: %s\n", genName, strerror(-ret)); exit(EXIT_FAILURE); } } void genTest( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, bool separateFetch) { char s[1024]; Kstring kstr; char *tName, tVect[64], *ptrName; KernelVarNames *vnames = &gset->varNames; DataType dtype = gset->kextra->dtype; const SubproblemDim *subdims = gset->subdims; unsigned int vecLen = gset->kextra->vecLen; size_t m, n, k; unsigned int i, j; bool tra, trb, localA, localB, vecCoords; int ret; TileMulFlags flags = mulOpts->flags; FetchOpts fetchOpts; m = gset->subdims[1].y; n = gset->subdims[1].x; k = gset->subdims[1].bwidth; tra = ((flags & TILEMUL_TRA) != 0); trb = ((flags & TILEMUL_TRB) != 0); localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0); tVect[0] = '\0'; if (vecCoords && vecLen != 1) { sprintf(tVect, "%u", vecLen); } switch (dtype) { case TYPE_FLOAT: tName = "float"; ptrName = "f"; break; case TYPE_DOUBLE: tName = "double"; ptrName = "d"; break; case TYPE_COMPLEX_FLOAT: tName = "float2"; ptrName = "f2v"; break; case TYPE_COMPLEX_DOUBLE: tName = "double2"; ptrName = "d2v"; break; default: return; } if (vecCoords) { //Do not use GPtrs in fetching vnames->A = "A"; vnames->B = "B"; } else { vnames->A = localA ? "LAptr" : "((GPtr)A)"; vnames->B = localB ? 
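/*
 * Illustrative invocation of this test (hypothetical command line; option
 * spellings as defined by printUsage() and main() in this file): running
 *
 *     ./t_tilemul --type s --fetch 4 -a c -b r 4 4 8
 *
 * requests single precision, fetch vectors of 4 elements, column-major fetch
 * order for tile A and row-major order for tile B. The positional arguments
 * give blockM = 4, blockN = 4, blockK = 8, so main() sets
 * subdims[1] = { y = 4, x = 4, bwidth = 8 } and, with ITEM_WORK_M =
 * ITEM_WORK_N = 4 and ITEM_BLOCKS_K = 3, subdims[0] = { y = 16, x = 16,
 * bwidth = 24 }; the kernel is then run on a 16 x 16 result block with K = 24.
 */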
"LBptr" : "((GPtr)B)"; } if (!localA) { vnames->lda = "lda"; } if (!localB) { vnames->ldb = "ldb"; } vnames->sizeM = "M"; vnames->sizeN = "N"; vnames->sizeK = "K"; vnames->skewA = "skewA"; vnames->skewB = "skewB"; vnames->skewK = "skewK"; vnames->coordA = "workItemM"; vnames->coordB = "workItemN"; vnames->k = "k"; kgenAddBlankLine(ctx); sprintf(s, "__attribute__((reqd_work_group_size(%i, %i, 1)))\n", ITEM_WORK_M, ITEM_WORK_N); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "__kernel void\n"); sprintf(s, "%s(\n", kernelName); kgenAddStmt(ctx, s); sprintf(s," %s alpha,\n", tName); kgenAddStmt(ctx, s); sprintf(s," __global %s%s *A,\n", tName, tVect); kgenAddStmt(ctx, s); sprintf(s," __global %s%s *B,\n", tName, tVect); kgenAddStmt(ctx, s); kgenAddStmt(ctx, " uint M,\n" " uint N,\n" " uint K,\n"); sprintf(s, " __global %s *C,\n" " const uint iter)\n", tName); kgenAddStmt(ctx, s); kgenBeginFuncBody(ctx); sprintf(s, "uint workItemM = %lu * get_global_id(0);\n" "uint workItemN = %lu * get_global_id(1);\n", m, n); kgenAddStmt(ctx, s); if ((flags & TILEMUL_SKEW_A) != 0) { kgenAddStmt(ctx, "uint skewA = 0u;\n"); } if ((flags & TILEMUL_SKEW_B) != 0) { kgenAddStmt(ctx, "uint skewB = 0u;\n"); } if ((flags & TILEMUL_SKEW_K) != 0) { kgenAddStmt(ctx, "uint skewK = 0u;\n"); } if (localA) { sprintf(s, "__local %s LA[%lu];\n", tName, subdims[0].bwidth * subdims[0].y); kgenAddStmt(ctx, s); } else { //global A sprintf(s, "uint lda = %s;\n", tra ? "M" : "K"); kgenAddStmt(ctx, s); } if (localB) { sprintf(s, "__local %s LB[%lu];\n", tName, subdims[0].bwidth * subdims[0].x); kgenAddStmt(ctx, s); } else { //global B sprintf(s, "uint ldb = %s;\n", trb ? "K" : "N"); kgenAddStmt(ctx, s); } initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY); declareTileStorages(ctx, gset); if (vecCoords) { size_t ha, hb; char *str; ha = tra ? k : m; hb = trb ? n : k; if (ha > 1) { str = s; str += sprintf(str, "uint%lu ca = {0", ha); for (i = 1; i < ha; i++) { str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen); } str += sprintf(str, "};\n"); kgenAddStmt(ctx, s); } else { kgenAddStmt(ctx, "uint ca = 0;\n"); } vnames->vectCoordA = "ca"; if (hb > 1) { str = s; str += sprintf(str, "uint%lu cb = {0", hb); for (i = 1; i < hb; i++) { str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen); } str += sprintf(str, "};\n"); kgenAddStmt(ctx, s); } else { kgenAddStmt(ctx, "uint cb = 0;\n"); } vnames->vectCoordB = "cb"; // uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3}; // uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3}; } kgenAddBlankLine(ctx); sprintf(s, "for (int it = 0; it < iter; it++)"); kgenBeginBranch(ctx, s); if (!(localA && localB)) { kgenAddStmt(ctx, "uint k = 0;\n"); } genZeroTile(ctx, &gset->tileCY); if (vecCoords) { char *coordsA[2] = {"workItemM", "k"}; char *coordsB[2] = {"k", "workItemN"}; sprintf(s, "A += %s * (lda / %u) + %s / %u;\n", coordsA[tra], vecLen, coordsA[1 - tra], vecLen); kgenAddStmt(ctx, s); sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n", coordsB[trb], vecLen, coordsB[1 - trb], vecLen); kgenAddStmt(ctx, s); } sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, s); /* Copy data to local memory. We know that the size of matrix is the same * that the size of one block and use that. 
*/ if (localA) { sprintf(s, "event_t evA = async_work_group_copy(LA, A, %lu, 0);\n" "wait_group_events(1, &evA);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n", subdims[0].y * subdims[0].bwidth); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "LPtr LAptr;\n"); if (tra) { sprintf(s, "LAptr.%s = LA + workItemM;\n", ptrName); } else { sprintf(s, "LAptr.%s = LA + workItemM * %lu;\n", ptrName, subdims[0].bwidth); } kgenAddStmt(ctx, s); } if (localB) { sprintf(s, "event_t evB = async_work_group_copy(LB, B, %lu, 0);\n" "wait_group_events(1, &evB);\n" "barrier(CLK_LOCAL_MEM_FENCE);\n", subdims[0].x * subdims[0].bwidth); kgenAddStmt(ctx, s); kgenAddStmt(ctx, "LPtr LBptr;\n"); if (trb) { sprintf(s, "LBptr.%s = LB + workItemN * %lu;\n", ptrName, subdims[0].bwidth); } else { sprintf(s, "LBptr.%s = LB + workItemN;\n", ptrName); } kgenAddStmt(ctx, s); } if (!separateFetch) { ret = tileMulGen(ctx, gset, mulOpts); checkRet(ret, "Multiplier"); } else { Tile *tileA = &gset->tileA; Tile *tileB = &gset->tileBX; memset(&fetchOpts, 0, sizeof(fetchOpts)); if (localA) { fetchOpts.memA = CLMEM_LOCAL_MEMORY; } if (localB) { fetchOpts.memB = CLMEM_LOCAL_MEMORY; } genFillTileWithNAN(ctx, tileA); genFillTileWithNAN(ctx, tileB); if (subdims[0].bwidth != subdims[1].bwidth) { sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)", subdims[0].bwidth, k); kgenBeginBranch(ctx, s); } #if JUST_MULTIPLICATION for (i = 0; i < tileA->nrRows; i++) { for(j = 0; j < tileA->nrCols; j++) { sprintfTileElement(&kstr, tileA, i, j, 1); sprintf(s, "%s = %u;\n", kstr.buf, i * tileA->nrCols + j); kgenAddStmt(ctx, s); } } for (i = 0; i < tileB->nrRows; i++) { for(j = 0; j < tileB->nrCols; j++) { sprintfTileElement(&kstr, tileB, i, j, 1); sprintf(s, "%s = %u;\n", kstr.buf, i * tileB->nrCols + j); kgenAddStmt(ctx, s); } } #else fetchOpts.mrole = MATRIX_B; fetchOpts.lineOffset = 0; fetchOpts.linesNum = (tileB->trans) ? tileB->nrCols : tileB->nrRows; ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); checkRet(ret, "Fetching tile b"); fetchOpts.mrole = MATRIX_A; fetchOpts.linesNum = (tileA->trans) ? tileA->nrCols : tileA->nrRows; kgenAddBlankLine(ctx); fetchOpts.lineOffset = 0; ret = genFetchInputTile(ctx, NULL, gset, &fetchOpts); checkRet(ret, "Fetching tile a"); #endif ret = genMulTiles(ctx, gset, mulOpts); checkRet(ret, "Multiplier"); #if ! JUST_MULTIPLICATION sprintf(s, "k += %lu;\n", k); kgenAddStmt(ctx, s); #endif if (subdims[0].bwidth != subdims[1].bwidth) { kgenEndBranch(ctx, NULL); } } kgenEndBranch(ctx, NULL); // K loop kgenEndBranch(ctx, NULL); // iterations loop kgenAddBlankLine(ctx); for (i = 0; i < m; i++) { for (j = 0; j < n; j++) { sprintfTileElement(&kstr, &gset->tileCY, i, j, 1); sprintf(s, "((GPtr)C).%s" "[(%d + workItemM) * N + %d + workItemN] = %s;\n", ptrName, i, j, kstr.buf); kgenAddStmt(ctx, s); } } kgenEndFuncBody(ctx); } cl_int run ( const char *ker, cl_uint M, cl_uint N, cl_uint K, FType alpha, BlasGenSettings *gset, TileMulFlags flags, cl_device_type deviceType, bool verbose, unsigned int iterNum) { cl_int err; cl_platform_id platform; cl_context ctx; cl_device_id device; cl_command_queue queue; cl_event evt; DataType dtype = gset->kextra->dtype; cl_mem bufA, bufB, bufC; FPtr A, B, C, C_naive; bool isComplex = isComplexType(dtype); bool isDouble = isDoubleBasedType(dtype); cl_uint nwords = (isComplex) ? 
2 : 1; unsigned int tsize = dtypeSize(dtype); cl_kernel kernel; size_t i, j, k; size_t globalWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N}; size_t localWorkSize[2] = {ITEM_WORK_M, ITEM_WORK_N}; char log[100000]; size_t logSize; cl_long sTime, fTime; cl_program program = NULL; clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, deviceType, 1, &device, NULL); ctx = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { return err; } queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { return err; } /* Prepare OpenCL kernel and its arguments */ program = clCreateProgramWithSource(ctx, 1, &ker, NULL, NULL); err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, sizeof(log), log, &logSize); printf("%s", log); if (err != CL_SUCCESS){ clReleaseProgram(program); return err; } kernel = clCreateKernel(program, kernelName, &err); if (err != CL_SUCCESS){ clReleaseProgram(program); return err; } /* Memory allocation */ A.v = malloc(M * K * tsize); B.v = malloc(K * N * tsize); C.v = malloc(M * N * tsize); C_naive.v = malloc(M * N * tsize); #if JUST_MULTIPLICATION srand(0); if (isDouble) { for(i = 0; i < M * K * nwords; i++){ A.d[i] = i; } for(i = 0; i < N * K * nwords; i++){ B.d[i] = i + 7; } for(i = 0; i < M * N * nwords; i++){ C.d[i] = 0.0; C_naive.d[i] = 0.0; } } else { for(i = 0; i < M * K * nwords; i++){ A.f[i] = i; } for(i = 0; i < N * K * nwords; i++){ B.f[i] = i + 7; } for(i = 0; i < M * N * nwords; i++){ C.f[i] = 0.0; C_naive.f[i] = 0.0; } } #else srand(0); if (isDouble) { for(i = 0; i < M * K * nwords; i++){ A.d[i] = (double)(rand() % RAND_BOUND); } for(i = 0; i < N * K * nwords; i++){ B.d[i] = (double)(rand() % RAND_BOUND); } for(i = 0; i < M * N * nwords; i++){ C.d[i] = 0.0; C_naive.d[i] = 0.0; } } else { for(i = 0; i < M * K * nwords; i++){ A.f[i] = (float)(rand() % RAND_BOUND); } for(i = 0; i < N * K * nwords; i++){ B.f[i] = (float)(rand() % RAND_BOUND); } for(i = 0; i < M * N * nwords; i++){ C.f[i] = 0.0; C_naive.f[i] = 0.0; } } #endif bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, K * M * tsize, A.v, &err); if (err != CL_SUCCESS) { clReleaseKernel(kernel); return err; } bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, K * N * tsize, B.v, &err); if (err != CL_SUCCESS) { clReleaseMemObject(bufA); clReleaseKernel(kernel); return err; } bufC = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, M * N * tsize, C.v, &err); if (err != CL_SUCCESS) { clReleaseMemObject(bufB); clReleaseMemObject(bufA); clReleaseKernel(kernel); return err; } /* Argument setting and kernel execution */ err = clSetKernelArg(kernel, 0, tsize, alpha.u); err |= clSetKernelArg(kernel, 1, sizeof(bufA), &bufA); err |= clSetKernelArg(kernel, 2, sizeof(bufB), &bufB); err |= clSetKernelArg(kernel, 3, sizeof(M), &M); err |= clSetKernelArg(kernel, 4, sizeof(N), &N); err |= clSetKernelArg(kernel, 5, sizeof(K), &K); err |= clSetKernelArg(kernel, 6, sizeof(bufC), &bufC); err |= clSetKernelArg(kernel, 7, sizeof(iterNum), &iterNum); if (err != CL_SUCCESS) { clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); clReleaseKernel(kernel); return err; } err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, &evt); if (err != CL_SUCCESS) { clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); clReleaseKernel(kernel); return err; } err = clFinish(queue); err = 
clEnqueueReadBuffer (queue, bufC, CL_TRUE, 0, M * N * tsize, C.v, 0, NULL, NULL); /* Naive CPU multiplication */ if (isDouble) { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { if (isComplex) { cl_double2 val; for (k = 0; k < K; k++) { cl_double2 bkj = flags & TILEMUL_TRB ? B.d2[j * K + k] : B.d2[k * N + j]; cl_double2 aik = flags & TILEMUL_TRA ? A.d2[k * M + i] : A.d2[i * K + k]; val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1]; val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0]; C_naive.d2[i * N + j].s[0] += val.s[0]; C_naive.d2[i * N + j].s[1] += val.s[1]; } val.s[0] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[0] - C_naive.d2[i * N + j].s[1] * alpha.d2.s[1]; val.s[1] = C_naive.d2[i * N + j].s[0] * alpha.d2.s[1] + C_naive.d2[i * N + j].s[1] * alpha.d2.s[0]; C_naive.d2[i * N + j] = val; } else { for (k = 0; k < K; k++) { double bkj = flags & TILEMUL_TRB ? B.d[j * K + k] : B.d[k * N + j]; double aik = flags & TILEMUL_TRA ? A.d[k * M + i] : A.d[i * K + k]; C_naive.d[i * N + j] += aik * bkj; } C_naive.d[i * N + j] *= alpha.d; } } } for (i = 0; i < M * N; i++) { if (C.d[i] != C_naive.d[i]) { printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N, C.d[i], C_naive.d[i]); break; } } if (i == M * N) { printf("Match\n"); } } else { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { if (isComplex) { cl_float2 val; for (k = 0; k < K; k++) { cl_float2 bkj = flags & TILEMUL_TRB ? B.f2[j * K + k] : B.f2[k * N + j]; cl_float2 aik = flags & TILEMUL_TRA ? A.f2[k * M + i] : A.f2[i * K + k]; val.s[0] = aik.s[0] * bkj.s[0] - aik.s[1] * bkj.s[1]; val.s[1] = aik.s[0] * bkj.s[1] + aik.s[1] * bkj.s[0]; C_naive.f2[i * N + j].s[0] += val.s[0]; C_naive.f2[i * N + j].s[1] += val.s[1]; } val.s[0] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[0] - C_naive.f2[i * N + j].s[1] * alpha.f2.s[1]; val.s[1] = C_naive.f2[i * N + j].s[0] * alpha.f2.s[1] + C_naive.f2[i * N + j].s[1] * alpha.f2.s[0]; C_naive.f2[i * N + j] = val; } else { for (k = 0; k < K; k++) { float bkj = flags & TILEMUL_TRB ? B.f[j * K + k] : B.f[k * N + j]; float aik = flags & TILEMUL_TRA ? A.f[k * M + i] : A.f[i * K + k]; C_naive.f[i * N + j] += aik * bkj; } C_naive.f[i * N + j] *= alpha.f; } } } for (i = 0; i < M * N; i++) { if (C.f[i] != C_naive.f[i]) { printf("Differ at (%lu, %lu): %lf != %lf\n", i / N, i % N, C.f[i], C_naive.f[i]); break; } } if (i == M * N) { printf("Match\n"); } } /* End of naive CPU multiplication */ if (verbose) { if (!isDouble) { printf("Matrix A:\n"); for (i = 0; i < M; i++) { for (k = 0; k < K; k++) { if (isComplex) { cl_float2 aik = flags & TILEMUL_TRA ? A.f2[k * M + i] : A.f2[i * K + k]; printf("(%4.1f, %4.1f) ", aik.s[0], aik.s[1]); } else { float aik = flags & TILEMUL_TRA ? A.f[k * M + i] : A.f[i * K + k]; printf("%4.1f ", aik); } } printf("\n"); } printf("Matrix B:\n"); for (k = 0; k < K; k++) { for (j = 0; j < N; j++) { if (isComplex) { cl_float2 bkj = flags & TILEMUL_TRB ? B.f2[j * K + k] : B.f2[k * N + j]; printf("(%4.1f, %4.1f) ", bkj.s[0], bkj.s[1]); } else { float bkj = flags & TILEMUL_TRB ? 
B.f[j * K + k] : B.f[k * N + j]; printf("%4.1f ", bkj); } } printf("\n"); } printf("CPU calculated matrix:\n"); for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { if (isComplex) { printf("(%4.1f, %4.1f) ", C_naive.f2[i * N + j].s[0], C_naive.f2[i * N + j].s[1]); } else { printf("%4.1f ", C_naive.f[i * N + j]); } } printf("\n"); } printf("GPU calculated matrix:\n"); for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { if (isComplex) { printf("(%4.1f, %4.1f) ", C.f2[i * N + j].s[0], C.f2[i * N + j].s[1]); } else { printf("%4.1f ", C.f[i * N + j]); } } printf("\n"); } } } clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &sTime, NULL); clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &fTime, NULL); printf("Total multiplication time: %d ms\nTime per iteration: %d ns\n", (int)((fTime-sTime)/1000000), (int)((fTime-sTime)/iterNum)); clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); clReleaseKernel(kernel); return CL_SUCCESS; } int main(int argc, char *argv[]) { char out[1024*1024]; CLBLASKernExtra kextra; BlasGenSettings gset; TileMulOpts mulOpts; int i; cl_uint blockM = 4, blockN = 4, blockK = 8; struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1); FType alpha; cl_int err; unsigned int iterNum = 1; const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov"; const struct option longOptions[] = { {"help", no_argument, NULL, 'h'}, {"device", required_argument, NULL, 'd'}, {"fetch", required_argument, NULL, 'f'}, {"local", required_argument, NULL, 'l'}, {"type", required_argument, NULL, 't'}, {"a", required_argument, NULL, 'a'}, {"b", required_argument, NULL, 'b'}, {"skew", required_argument, NULL, 's'}, {"globalcycling", required_argument, NULL, 'g'}, {"iter", required_argument, NULL, 'i'}, {"core", required_argument, NULL, 'c'}, {"old", no_argument, NULL, 'o'}, {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; int nextOption; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; bool verbose = false; SubproblemDim *subdims = gset.subdims; bool separateFetch = false; memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&kextra, 0, sizeof(kextra)); gset.kextra = &kextra; gset.flags |= BGF_WHOLE_A; mulOpts.core = TILEMUL_MAD; mulOpts.flags = TILEMUL_FORCE_VECTORIZATION; kextra.vecLen = 1; kextra.dtype = TYPE_FLOAT; alpha.f = 1; // parse command line do { nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL); switch (nextOption) { case 'h': printUsage(argv[0], EXIT_SUCCESS); break; case 'd': if (!strcmp("cpu", optarg)) { deviceType = CL_DEVICE_TYPE_CPU; } else if (!strcmp("gpu", optarg)) { deviceType = CL_DEVICE_TYPE_GPU; } else { printf("Unknown device type %s. Supported values are \"cpu\" " "and \"gpu\".\n", optarg); exit(EXIT_FAILURE); } break; case 'f': kextra.vecLen = atoi(optarg); break; case 'l': if (!strcmp(optarg, "A")) { mulOpts.memA = CLMEM_LOCAL_MEMORY; } else if (!strcmp(optarg, "B")) { mulOpts.memB = CLMEM_LOCAL_MEMORY; } else { printf("Wrong matrix specified: %s. Supported values are " "A, B.\n", optarg); exit(EXIT_FAILURE); } break; case 't': if (!strcmp(optarg, "s")) { kextra.dtype = TYPE_FLOAT; alpha.f = 1; } else if (!strcmp(optarg, "d")) { kextra.dtype = TYPE_DOUBLE; alpha.d = 1; } else if (!strcmp(optarg, "c")) { kextra.dtype = TYPE_COMPLEX_FLOAT; alpha.f2.s[0] = 1; alpha.f2.s[1] = 0; } else if (!strcmp(optarg, "z")) { kextra.dtype = TYPE_COMPLEX_DOUBLE; alpha.d2.s[0] = 1; alpha.d2.s[1] = 0; } else { printf("Wrong type specified: %s. 
Supported values are " "s, d, c, z.\n", optarg); exit(EXIT_FAILURE); } break; case 'a': if (!strcmp(optarg, "r")) { mulOpts.flags &= ~TILEMUL_TRA; } else if (!strcmp(optarg, "c")) { mulOpts.flags |= TILEMUL_TRA; } else { printf("Wrong tile a parameter specified: %s. Supported values " "are \"r\", \"c\".\n", optarg); exit(EXIT_FAILURE); } break; case 'b': if (!strcmp(optarg, "r")) { mulOpts.flags &= ~TILEMUL_TRB; } else if (!strcmp(optarg, "c")) { mulOpts.flags |= TILEMUL_TRB; } else { printf("Wrong tile b order specified: %s. Supported values " "are \"r\", \"c\".\n", optarg); exit(EXIT_FAILURE); } break; case 's': if (!strcmp(optarg, "a")) { mulOpts.flags |= TILEMUL_SKEW_A; } else if (!strcmp(optarg, "b")) { mulOpts.flags |= TILEMUL_SKEW_B; } else if (!strcmp(optarg, "k")) { mulOpts.flags |= TILEMUL_SKEW_K; } else { printf("Wrong skew parameter specified: %s. Supported values " "are \"a\", \"b\", \"k\"\n", optarg); exit(EXIT_FAILURE); } break; case 'g': if (!strcmp(optarg, "a")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } else if (!strcmp(optarg, "b")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; } else if (!strcmp(optarg, "k")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; } else { printf("Wrong global cycling parameter specified: %s. " "Supported values are \"a\", \"b\", \"k\"\n", optarg); exit(EXIT_FAILURE); } break; case 'i': iterNum = atoi(optarg); break; case 'c': if (!strcmp("muladd", optarg)) { mulOpts.core = TILEMUL_MULADD; } else if (!strcmp("mad", optarg)) { mulOpts.core = TILEMUL_MAD; } else if (!strcmp("dot", optarg)) { mulOpts.core = TILEMUL_DOT; } else { printf("Unknown multiplier core %s. Supported values" " are \"muladd\", \"mad\" and \"dot\".\n", optarg); exit(EXIT_FAILURE); } break; case 'o': separateFetch = false; break; case 'v': verbose = true; break; case -1: break; default: printUsage(argv[0], EXIT_FAILURE); break; } } while (nextOption != -1); if (optind + 2 >= argc) { printf("Error: Not all sizes are specified\n"); printUsage(argv[0], EXIT_FAILURE); } blockM = atoi(argv[optind]); blockN = atoi(argv[optind + 1]); blockK = atoi(argv[optind + 2]); if ((mulOpts.memA == CLMEM_LOCAL_MEMORY || mulOpts.memB == CLMEM_LOCAL_MEMORY) && ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) { printf("One of matrixes is in local memory, " "disabling global cycling\n"); mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC; } if (mulOpts.flags & TILEMUL_TRA) { kextra.flags |= KEXTRA_TRANS_A; } if (mulOpts.flags & TILEMUL_TRB) { kextra.flags |= KEXTRA_TRANS_B; } subdims[0].y = blockM * ITEM_WORK_M; subdims[0].x = blockN * ITEM_WORK_N; subdims[0].bwidth = blockK * ITEM_BLOCKS_K; subdims[1].y = blockM; subdims[1].x = blockN; subdims[1].bwidth = blockK; memset(out, 0, sizeof(out)); i = isDoubleBasedType(kextra.dtype); kgenDeclareUptrs(ctx, i); genTest(ctx, &gset, &mulOpts, separateFetch); destroyKgenContext(ctx); printf("Kernel code: \n\"%s\"\n", out); err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha, &gset, mulOpts.flags, deviceType, verbose, iterNum); if (err != CL_SUCCESS) { printf("Test run failed, error %d\n", err); return EXIT_FAILURE; } return EXIT_SUCCESS; } clblas-2.10/src/library/blas/gens/tile.c000066400000000000000000000312701264277366700201340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #include #include "blas_kgen.h" // assign tile's base name to 'name' if it is assigned to zero pointer static __inline void selectTileBaseName(Tile *tile, const char *name) { if (tile->baseName == NULL) { tile->baseName = name; } } static void selectDefaultTileVecLen( Tile *tile, TileCreationFlags tflags, const BlasGenSettings *gset, BlasFunctionID funcID, MatrixRole mrole) { if (tflags & TILE_WITH_FETCH_VECLEN) { tile->vecLen = getVecLen(gset, funcID, mrole); } else { size_t w; w = (tile->trans) ? tile->nrRows : tile->nrCols; if (tile->packed) { size_t wpad, height; wpad = roundUpPow2(w); height = (tile->trans) ? tile->nrCols : tile->nrRows; tile->vecLen = (unsigned int)szmin(height * wpad, MAX_TILE_VECLEN); } else { tile->vecLen = (unsigned int)roundUpPow2(w); tile->vecLen = (unsigned int)szmin(tile->vecLen, MAX_TILE_VECLEN); } } } // physical tile pitch, can be less than one vector in case of packed mode static unsigned int tilePitch(const Tile *tile) { unsigned int pitch; if (!tile->trans) { if (tile->packed) { pitch = (unsigned int)roundUpPow2(tile->nrCols); } else { pitch = (unsigned int)roundUp(tile->nrCols, tile->vecLen); } } else { if (tile->packed) { pitch = (unsigned int)roundUpPow2(tile->nrRows); } else { pitch = (unsigned int)roundUp(tile->nrRows, tile->vecLen); } } return pitch; } void initTile( Tile *tile, const char *baseName, unsigned int nrRows, unsigned int nrCols, unsigned int vecLen, DataType dtype, PrivateStorageType storType, bool trans, bool packed) { assert(baseName == NULL || strlen(baseName) <= MAX_TILE_BASE_NAMELEN); tile->baseName = baseName; tile->nrRows = nrRows; tile->nrCols = nrCols; tile->vecLen = umin(MAX_TILE_VECLEN, vecLen); tile->dtype = dtype; tile->storType = storType; tile->trans = trans; tile->packed = packed; } void initDefaultTiles( BlasGenSettings *gset, BlasFunctionID funcID, TileCreationFlags flags, PrivateStorageType storType) { const SubproblemDim *dim = &gset->subdims[1]; KernelExtraFlags kflags = gset->kextra->flags; DataType dtype = gset->kextra->dtype; Tile *tile; const char *name; int level; bool packed; level = funcBlasLevel(funcID); packed = ((flags & TILE_PACKED) != 0); tile = &gset->tileA; selectTileBaseName(tile, "a"); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->bwidth, 1, dtype, storType, false, packed); tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_A); if (!(gset->flags & BGF_WHOLE_A)) { if (tile->trans) { tile->nrCols = 1; } else { tile->nrRows = 1; } } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_A); tile = &gset->tileBX; name = (level == 2) ? 
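/*
 * Small worked example of the pitch and storage bookkeeping above (assumed
 * numbers): a non-transposed float tile with nrRows = 4, nrCols = 3 and
 * vecLen = 8. Unpacked, tilePitch() pads every row to a whole vector,
 * roundUp(3, 8) = 8, so tileVectorsNum() = divRoundUp(4 * 8, 8) = 4 vectors.
 * Packed, the pitch is only roundUpPow2(3) = 4, two rows share one float8,
 * and tileVectorsNum() = divRoundUp(4 * 4, 8) = 2 vectors.
 */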
"x" : "b"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->bwidth, (unsigned int)dim->x, 1, dtype, storType, false, packed); /* * NOTE: Tiles for the level 2 functions are forced to be transposed * in order to allow user to fetch elements belonging to different * rows which is very useful in case of unit increment between * elements because provides faster access to the global memory. */ if (level == 2) { tile->trans = true; } else { tile->trans = !isMatrixAccessColMaj(funcID, kflags, MATRIX_B); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_B); tile = &gset->tileCY; name = (level == 2) ? "y" : "c"; selectTileBaseName(tile, name); initTile(tile, tile->baseName, (unsigned int)dim->y, (unsigned int)dim->x, 1, dtype, storType, false, packed); if (level == 2) { tile->trans = true; } else if (!(flags & TILE_C_FORCE_NOTRANS)) { tile->trans = isMatrixAccessColMaj(funcID, kflags, MATRIX_C); } selectDefaultTileVecLen(tile, flags, gset, funcID, MATRIX_C); // FIXME: remove the restriction /*if (isComplexType(tile->dtype)) { tile->vecLen = 1; }*/ } unsigned int tileVectorsNum(const Tile *tile) { size_t pitch, height; pitch = tilePitch(tile); height = (tile->trans) ? tile->nrCols : tile->nrRows; return (unsigned int)divRoundUp(height * pitch, tile->vecLen); } unsigned int tileStorageSize(const Tile *tile) { unsigned int u; u = tileVectorsNum(tile) * tile->vecLen; return u; } unsigned int tileLineSegmentLen(const Tile *tile) { unsigned int pitch; unsigned int len; pitch = tilePitch(tile); len = umin(pitch, tile->vecLen); if (tile->trans) { len = umin(len, tile->nrRows); } else { len = umin(len, tile->nrCols); } return len; } int declareOneTileStorage(struct KgenContext *ctx, const Tile *tile) { char tmp[1024]; const char *tname; int r; size_t size; getVectorTypeName(tile->dtype, tile->vecLen, &tname, NULL); size = tileVectorsNum(tile); if (tile->storType == PRIV_STORAGE_ARRAY) { sprintf(tmp, "%s %s[%lu];\n", tname, tile->baseName, size); } else { size_t i; char *p; sprintf(tmp, "%s %s0", tname, tile->baseName); p = tmp + strlen(tmp); for (i = 1; i < size; i++) { sprintf(p, ", %s%lu", tile->baseName, i); p += strlen(p); } strcpy(p, ";\n"); } r = kgenAddStmt(ctx, tmp); return (r) ? -EOVERFLOW : 0; } int declareTileStorages(struct KgenContext *ctx, const BlasGenSettings *gset) { int ret; ret = declareOneTileStorage(ctx, &gset->tileA); if (!ret) { ret = declareOneTileStorage(ctx, &gset->tileBX); } if (!ret) { declareOneTileStorage(ctx, &gset->tileCY); } return ret; } void sprintfTileElement( Kstring *str, const Tile *tile, unsigned int row, unsigned int col, unsigned int len) { unsigned int pitch; unsigned int elemLen; unsigned int off; unsigned int vecLen = tile->vecLen; char vchunk[24]; if (len == 0) { len = vecLen; } pitch = tilePitch(tile); elemLen = isComplexType(tile->dtype) ? 
2 : 1; if (!tile->trans) { assert((row < tile->nrRows) && (col + len <= tile->nrCols)); off = (row * pitch + col) * elemLen; } else { assert((row + len <= tile->nrRows) && (col < tile->nrCols)); off = (col * pitch + row) * elemLen; } vecLen *= elemLen; sprintfVecChunk(vchunk, vecLen, len * elemLen, off % vecLen); if (tile->storType == PRIV_STORAGE_ARRAY) { sprintf(str->buf, "%s[%u]%s", tile->baseName, off / vecLen, vchunk); } else { sprintf(str->buf, "%s%u%s", tile->baseName, off / vecLen, vchunk); } } void sprintfTileElementHalf( Kstring *str, const Tile *tile, unsigned int row, unsigned int col, TileElementHalf half) { int len; assert(isComplexType(tile->dtype)); // sprintf the full element and the drop an unneded half sprintfTileElement(str, tile, row, col, 1); len = (int)strlen(str->buf); if (half == TE_HALF_HIGH) { str->buf[len - 2] = str->buf[len - 1]; } str->buf[len - 1] = '\0'; } int forEachTile(Kstring *kstr, unsigned int row, unsigned int col, unsigned int num, Tile *first, ...) { unsigned int minVecLen = first->vecLen; unsigned int valRow = first->nrRows; unsigned int valCol = first->nrCols; va_list argptr; unsigned int i; va_start(argptr, first); for (i = 1; i < num; i++) { Tile * cur = va_arg( argptr, Tile * ); minVecLen = umin(minVecLen, cur->vecLen); } va_end(argptr); if (first->trans) { valRow /= minVecLen; } else { valCol /= minVecLen; } if (row >= valRow || col >= valCol /*|| row < 0 || col < 0*/) { //would be signed return 0; } if (kstr) { va_start(argptr, first); for (i = 0; i < num; i++) { Tile * cur = i ? va_arg( argptr, Tile * ) : first; if (cur->baseName) { unsigned int vRow = (cur->trans ? row * minVecLen : row); unsigned int vCol = (cur->trans ? col : col * minVecLen); sprintfTileElement(&kstr[i], cur, vRow, vCol, minVecLen); } } va_end(argptr); } return first->trans ? 
valRow : valCol; } void genSetZeroInTile( struct KgenContext *ctx, const Tile *tile, unsigned int row, unsigned int col, unsigned int len) { char tmp[1024]; Kstring elem; sprintfTileElement(&elem, tile, row, col, len); sprintf(tmp, "%s = 0;\n", elem.buf); kgenAddStmt(ctx, tmp); } void genSetUnitInTile( struct KgenContext *ctx, const Tile *tile, unsigned int row, unsigned int col) { char tmp[1024]; Kstring elem; const char *s; sprintfTileElement(&elem, tile, row, col, 1); s = strOne(tile->dtype); sprintf(tmp, "%s = %s;\n", elem.buf, s); kgenAddStmt(ctx, tmp); } void genZeroTile(struct KgenContext *ctx, const Tile *tile) { char tmp[1024]; Kstring elem; unsigned int incRows, incCols; unsigned int i, j, v; v = tileLineSegmentLen(tile); if (!tile->trans) { incRows = 1; incCols = v; } else { incRows = v; incCols = 1; } for (i = 0; i < tile->nrRows; i += incRows) { for (j = 0; j < tile->nrCols; j += incCols) { sprintfTileElement(&elem, tile, i, j, v); sprintf(tmp, "%s = 0;\n", elem.buf); kgenAddStmt(ctx, tmp); } } kgenAddBlankLine(ctx); } void genTileCopy( struct KgenContext *ctx, const Tile *dst, const Tile *src, TileCopyOps op) { char tmp[1024]; Kstring el1, el2; unsigned int nrRows, nrCols; unsigned int incRows, incCols; unsigned int vlen; unsigned int i, j; nrRows = umin(dst->nrRows, src->nrRows); nrCols = umin(dst->nrCols, src->nrCols); if (dst->trans != src->trans) { vlen = 1; incRows = incCols = 1; } else { vlen = umin(dst->vecLen, src->vecLen); if (!dst->trans) { incRows = 1; incCols = umin(dst->nrCols, src->nrCols); incCols = umin(incCols, vlen); } else { incRows = umin(dst->nrRows, src->nrRows); incRows = umin(incRows, vlen); incCols = 1; } } for (i = 0; i < nrRows; i += incRows) { for (j = 0; j < nrCols; j += incCols) { sprintfTileElement(&el1, dst, i, j, vlen); sprintfTileElement(&el2, src, i, j, vlen); switch( op ) { case TILECOPY_ASSIGN: sprintf(tmp, "%s = %s;\n", el1.buf, el2.buf); break; case TILECOPY_ADD_ASSIGN: sprintf(tmp, "%s += %s;\n", el1.buf, el2.buf); break; case TILECOPY_SUB_ASSIGN: sprintf(tmp, "%s -= %s;\n", el1.buf, el2.buf); break; case TILECOPY_MUL_ASSIGN: sprintf(tmp, "%s *= %s;\n", el1.buf, el2.buf); break; case TILECOPY_DIV_ASSIGN: sprintf(tmp, "%s /= %s;\n", el1.buf, el2.buf); break; case TILECOPY_MOD_ASSIGN: sprintf(tmp, "%s %%= %s;\n", el1.buf, el2.buf); break; default: break; } kgenAddStmt(ctx, tmp); } } kgenAddBlankLine(ctx); } clblas-2.10/src/library/blas/gens/tile.h000066400000000000000000000313461264277366700201450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * API to manupulate with matrix tiles */ #ifndef TILE_H_ #define TILE_H_ #include #include #define tileLineElemNum forEachTile struct BlasGenSettings; enum { MAX_TILE_BASE_NAMELEN = sizeof(Kstring) - 25, /* * It may be 16 vector components at maximum. 
Adding the length of the * subscript and selector operator, 2 digit index, and the end-line symbol, * to the maximum base name length we get the maximum tile element string * length */ MAX_TILE_ELEMENT_STRLEN = sizeof(Kstring) - 1, MAX_TILE_VECLEN = 8 }; /** * @internal * @brief Flags showing tile storing specifics * @ignroup TILES */ typedef enum TileCreationFlags { /** Tile C should be forced to non-transposed form */ TILE_C_FORCE_NOTRANS = 0x01, /** tile vector length is equal to the length of fetched vectors */ TILE_WITH_FETCH_VECLEN = 0x02, /** * If depending of transposing vector length is greater than * number of rows or columns, store several rows or columns respectively * in each vector */ TILE_PACKED = 0x04 } TileCreationFlags; /** * @internal * @brief Type of storage in the private memory * @ingroup TILES */ typedef enum PrivateStorageType { /** Tile is stored in array */ PRIV_STORAGE_ARRAY, /** Tile is stored in a set of variables */ PRIV_STORAGE_VARIABLE_SET } PrivateStorageType; typedef enum TileCopyOps { TILECOPY_ASSIGN, TILECOPY_ADD_ASSIGN, TILECOPY_SUB_ASSIGN, TILECOPY_MUL_ASSIGN, TILECOPY_DIV_ASSIGN, TILECOPY_MOD_ASSIGN } TileCopyOps; /** * @internal * @brief Tile element half types * @ingroup TILES */ typedef enum TileElementHalf { TE_HALF_LOW, TE_HALF_HIGH } TileElementHalf; /** * @internal * @brief Matrix tile stored in a private area * @ingroup TILES */ typedef struct Tile { const char *baseName; unsigned int nrRows; unsigned int nrCols; unsigned int vecLen; DataType dtype; PrivateStorageType storType; /** Flag of storing tile in the transposed form */ bool trans; /* * Depending on the transposing several rows or columns can be fit * into single vector. It makes sense only when number of rows or column * respectively is less than vector length */ bool packed; } Tile; /** * @internal * @brief Initialize tile * * @param[out] tile Tile description structure to fill * @param[in] baseName Tile base name * @param[in] nrRows Number of rows in the tile * @param[in] nrCols Number of columns in the tile * @param[in] vecLen Length of one native OpenCL element being a part of * the tile * @param[in] dtype Data type * @param[in] storType Tile storate type * @param[in] trans Shows if tile is stored in the transposed form * or direct * @param[in] packed Tile is stored in packed form. Has not effect if * a single line can be fit into the single vector. * * If \b vecLen param is above MAX_TILE_VECLEN then will be truncated into * MAX_TILE_VECLEN. * * @ingroup TILES */ void initTile( Tile *tile, const char *baseName, unsigned int nrRows, unsigned int nrCols, unsigned int vecLen, DataType dtype, PrivateStorageType storType, bool trans, bool packed); /** * @internal * @brief Initialize matrix tile from generator settings * * @param[out] gset Generator settings which tile should be initialized in * @param[in] funcID BLAS function ID * @param[in] flags Tile creation flags * @param[in] storType Storage type * * If \b baseName field of a tile structure in the generator settings is zero, * it is initialized with the default value: "a" for the matrix A, "b" for * the matrix B, "x" for the vector X, "c" for the matrix C, and "y" for the * vector Y. * * As X and Y are column-vectors from the math point of view, tiles for them * are always packed irrespectively the TileCreationFlags::TILE_PACKED flag * is specified or not. * * * Transposition of C tile matches transposition of C matrix by default, until * the TILE_C_FORCE_NOTRANS flag is not set. 
If the flag is set, tile is * forced to be initialized as non-transposed and veclen must be verified. * */ void initDefaultTiles( struct BlasGenSettings *gset, BlasFunctionID funcID, TileCreationFlags flags, PrivateStorageType storType); /** * @internal * @brief Get entire number of vectors in the tile * * @param[in] tile Tile to get number of vectors of */ unsigned int tileVectorsNum(const Tile *tile); /** * @internal * @brief Size of entire tile storage in elements * * @param[in] tile Tile to get size of */ unsigned int tileStorageSize(const Tile *tile); /** * @brief Get length of tile line segment * * @param[in] Tile Source tile * * Under that segment it is assumed such a part of line which doesn't cross over * vector bound and row/column bound depending on the tile is transposed or not. * In the other words, this is a piece of data which provides maximum possible * vectorization don't breaking correctness. */ unsigned int tileLineSegmentLen(const Tile *tile); /** * @internal * @brief Declare variables needed to store a tile * * @param[out] ctx Generator context * @param[in] gset Generator settings containing desctiptors of * tiles to declare storages for * * If a tile is fit into a single variable of the native type matching * to the tile's vector length, it is declared a single variable with the name * matching the \b baseName field being a part of the @ref Tile structure. * If not, the following rules are applied. If the tile is needed to be stored * in a private array, variable name matches the base name and array size * is sufficient to fit such a tile. If the tile is needed to be stored * in a set of variables which names are arranged as the base name followed * with an integer index starting from zero and incremented by one for each * subsequent variable. * * @return 0 on success, and -EOVERFLOW if the source buffer is overflowed * * @ingroup TILES */ int declareTileStorages(struct KgenContext *ctx, const struct BlasGenSettings *gset); /** * @internal * @brief Declare variable needed to store one tile * * @param[out] ctx Generator context * @param[in] tile Tile settings containing desctiptors of * a tile to declare storages for * * If a tile is fit into a single variable of the native type matching * to the tile's vector length, it is declared a single variable with the name * matching the \b baseName field being a part of the @ref Tile structure. * If not, the following rules are applied. If the tile is needed to be stored * in a private array, variable name matches the base name and array size * is sufficient to fit such a tile. If the tile is needed to be stored * in a set of variables which names are arranged as the base name followed * with an integer index starting from zero and incremented by one for each * subsequent variable. * * @return 0 on success, and -EOVERFLOW if the source buffer is overflowed * * @ingroup TILES */ int declareOneTileStorage(struct KgenContext *ctx, const Tile *tile); /** * @internal * @brief Sprintf element composed of one or several data elements * stored in the tile * * @param[out] str Kernel string object to store tile element * expression * @param[in] tile Tile description structure * @param[in] row Row of the starting element * @param[in] col Element column * @param[in] len Number of tile elements needed to be captured by * the expression * * \b row should be less than number of rows and \b col should be less than * number of columns in the tile. Traversal of a tile line is not allowed. 
* That means \b col plus \b len should be not greater than number of columns * if the tile is stored in direct form, and \b row plus \b len should be not * greater than number of rows if the tile is stored in transposed form. * If it is not hold true in debug mode, an assertion is triggered. * In the release may produce a wrong code which can be even not compilable. * * @ingroup TILES */ void sprintfTileElement( Kstring *str, const Tile *tile, unsigned int row, unsigned int col, unsigned int len); /** * @internal * @brief Sprintf half of a single complex data element stored in the tile * * @param[out] str Kernel string object to store tile element * expression * @param[in] tile Tile description structure * @param[in] row Row of the starting element * @param[in] col Element column * @param[in] half Half type * * The restrictions for \b row and \b col are the same as for * sprintfTileElement(). This function is applicable only for tiles containing * complex data and must not be used in case of real data. * * @ingroup TILES */ void sprintfTileElementHalf( Kstring *str, const Tile *tile, unsigned int row, unsigned int col, TileElementHalf half); /** * @internal * @brief Sprintf element composed of one or several data elements * stored in each of the tiles * * @param[out] kstrs Kernel string objects array to store element * expression for each tile * @param[in] row Vectorizable element row * @param[in] col Vectorizable element column * @param[in] num Number of tile description structure * @param[in] first First tile description structure * * Decides how many vectored access in for each line of each tile will be and * does sprintfTileElement() for each of tiles. This function can have got any * value of \b row \b and \b col \b. \b kstrs \b and \b tile->baseName \b can * have NULL, then no sprintfTileElement() will be executed. * * @return 0 if no sprintf tiles, or number of vectors in one line * * @ingroup TILES */ int forEachTile(Kstring *kstrs, unsigned int row, unsigned int col, unsigned int num, Tile *first, ...); /** * @internal * @brief Generate assigning a tile element with zero * * @param[out] ctx Generator context * @param[in] tile Tile description structure * @param[in] row Row of the starting element * @param[in] col Element column * @param[in] len Number of elements needed to be assigned with zero * * See decription of sprintfTileElement() for more details about restrictions * on \b row, \b col and \b len. * * @ingroup TILES */ void genSetZeroInTile( struct KgenContext *ctx, const Tile *tile, unsigned int row, unsigned int col, unsigned int len); /** * @internal * @brief Generate assigning a tile element with unit * * @internal * @brief Generate assigning a tile element with zero * * @param[out] ctx Generator context * @param[in] tile Tile description structure * @param[in] row Row of the starting element * @param[in] col Element column * * \b row should be less than number of rows and \b col should be less than * number of columns in the tile. If it is not hold true in debug mode, * an assertion is triggered. In the release may produce a wrong code which * can be even not compilable. 
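 *
 * Example (a sketch only; the exact element name and unit literal depend on
 * the tile's storage type, vector length and data type, as produced by
 * sprintfTileElement() and strOne()): for a tile declared with the
 * hypothetical base name "a", a call such as
 * @code
 * genSetUnitInTile(ctx, &tileA, 0, 0);
 * @endcode
 * emits a statement of the form "a0 = 1;" into the generated kernel source.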
* * @ingroup TILES */ void genSetUnitInTile( struct KgenContext *ctx, const Tile *tile, unsigned int row, unsigned int col); /** * @internal * @brief Generate zeroing an entire tile * * @param[out] ctx Generator context * @param[in] tile Tile description structure * * @ingroup TILES */ void genZeroTile(struct KgenContext *ctx, const Tile *tile); /** * @internal * @brief Generate copying between 2 tiles * * @param[out] ctx Generator context * @param[in] dst Destination tile * @param[in] src Source tile * * @ingroup TILES */ void genTileCopy( struct KgenContext *ctx, const Tile *dst, const Tile *src, TileCopyOps op); #endif /* TILE_H_ */ clblas-2.10/src/library/blas/gens/tile_iter.c000066400000000000000000000151211264277366700211540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include "tile_iter.h" // Translate coordiates in physical memory block // into logical tile coordinates static int iterCalcLogCoords( PhysTileIterator* iter){ if( NULL == iter ){ return -EINVAL; } if ( iter->isLogRowMaj ) { iter->row = iter->line; iter->col = iter->vec*iter->vecLen; } else { iter->col = iter->line; iter->row = iter->vec*iter->vecLen; } return 0; } //----------------------------------------------------------------------------- int iterInit(PhysTileIterator *iter, const Tile *tile, int vecLen, unsigned int tileIterFlags) { if( NULL == iter || NULL == tile ){ return -EINVAL; } memset(iter, 0, sizeof(PhysTileIterator)); iter->isLogRowMaj = tile->trans ? 
0 : 1; iter->vecLen = vecLen; if ( iter->isLogRowMaj ) { if ( tile->nrCols % vecLen ) { return -EINVAL; } if ( tileIterFlags & TILE_ITER_BACKWARD_ROWS ) { iter->phyIterFlags |= PHY_ITER_BACKWARD_LINES; } if ( tileIterFlags & TILE_ITER_BACKWARD_COLS ) { iter->phyIterFlags |= PHY_ITER_BACKWARD_VECS; } iter->nrLines = tile->nrRows; iter->nrVecs = tile->nrCols/vecLen; } else { if ( tile->nrRows % vecLen ) { return -EINVAL; } if ( tileIterFlags & TILE_ITER_BACKWARD_ROWS ) { iter->phyIterFlags |= PHY_ITER_BACKWARD_VECS; } if ( tileIterFlags & TILE_ITER_BACKWARD_COLS ) { iter->phyIterFlags |= PHY_ITER_BACKWARD_LINES; } iter->nrLines = tile->nrCols; iter->nrVecs = tile->nrRows/vecLen; } switch( iter->phyIterFlags & ( PHY_ITER_BACKWARD_VECS | PHY_ITER_BACKWARD_LINES ) ){ // lines - forward, vectors - forward case !( PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS ): iter->vec = 0; iter->line = 0; break; // lines - forward, vectors - backward case PHY_ITER_BACKWARD_VECS: iter->vec = iter->nrVecs-1; iter->line = 0; break; // lines - backward, vectors - forward case PHY_ITER_BACKWARD_LINES: iter->vec = 0; iter->line = iter->nrLines-1; break; // lines - backward, vectors - backward case PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS: iter->vec = iter->nrVecs-1; iter->line = iter->nrLines-1; break; } iterCalcLogCoords(iter); return 0; } //----------------------------------------------------------------------------- int iterIterate(PhysTileIterator *iter) { if( NULL == iter ){ return -EINVAL; } //tile end if( iterIsEnd(iter) ){ return 1; } switch( iter->phyIterFlags & ( PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS) ){ // lines - forward, vectors - forward case !( PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS ): if( iter->nrVecs-1 == iter->vec ){ iter->vec = 0; iter->line++; } else{ iter->vec++; } break; // lines - forward, vectors - backward case PHY_ITER_BACKWARD_VECS: if( 0 == iter->vec ){ iter->vec = iter->nrVecs-1; iter->line++; } else{ iter->vec--; } break; // lines - backward, vectors - forward case PHY_ITER_BACKWARD_LINES: if( iter->nrVecs-1 == iter->vec ){ iter->vec = 0; iter->line--; } else{ iter->vec++; } break; // lines - backward, vectors - backward case ( PHY_ITER_BACKWARD_LINES | PHY_ITER_BACKWARD_VECS ): if( 0 == iter->vec ){ iter->vec = iter->nrVecs-1; iter->line--; } else{ iter->vec--; } break; } iterCalcLogCoords(iter); return 0; } //----------------------------------------------------------------------------- int iterSeek( PhysTileIterator *iter, int row, int col ) { if ( NULL == iter ) { return -EINVAL; } iter->row = row; iter->col = col; if ( iter->isLogRowMaj ) { iter->line = row; iter->vec = col/iter->vecLen; } else { iter->line = col; iter->vec = row/iter->vecLen; } assert( iter->line < iter->nrLines ); assert( iter->vec < iter->nrVecs ); return 0; } //----------------------------------------------------------------------------- int iterSeekPhys( PhysTileIterator *iter, int line, int vec ) { if ( NULL == iter ) { return -EINVAL; } iter->line = line; iter->vec = vec; if ( iter->isLogRowMaj ) { iter->row = line; iter->col = vec * iter->vecLen; } else { iter->row = vec * iter->vecLen; iter->col = line; } assert( iter->line < iter->nrLines ); assert( iter->vec < iter->nrVecs ); return 0; } //----------------------------------------------------------------------------- /* * Check if the entire tile has been iterated. Return true if the iterator is * at the next element beyond the last. 
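 *
 * A typical forward traversal of a tile looks like the following sketch
 * (illustrative names only, error checking omitted):
 *
 *   PhysTileIterator it;
 *   iterInit(&it, &tile, vecLen, 0);
 *   while (!iterIsEnd(&it)) {
 *       // it.row / it.col hold the logical coordinates,
 *       // it.line / it.vec the physical line and vector indices
 *       iterIterate(&it);
 *   }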
*/ int iterIsEnd(const PhysTileIterator *iter) { int isEnd = false; if( NULL == iter ){ return -EINVAL; } if( iter->phyIterFlags & PHY_ITER_BACKWARD_LINES ){ if( iter->line < 0 ){ isEnd = true; } } else{ if( iter->line >= iter->nrLines ){ isEnd = true; } } return isEnd; } clblas-2.10/src/library/blas/gens/tile_iter.h000066400000000000000000000040431264277366700211620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TILE_ITER_H #define TILE_ITER_H #include "blas_kgen.h" typedef enum TileIterFlags { // iterate in the backward direction along logical rows TILE_ITER_BACKWARD_ROWS = 0x01, // iterate in the backward direction along logical columns TILE_ITER_BACKWARD_COLS = 0x02 } TileIterFlags; typedef enum PhyIterFlags { PHY_ITER_BACKWARD_LINES = 0x01, PHY_ITER_BACKWARD_VECS = 0x02, } PhyIterFlags; typedef struct PhysTileIterator { int row; // logical tile row int col; // logical tile column int phyIterFlags; int isLogRowMaj; int vecLen; int line; // physical line int vec; // vector in physical line int nrLines; // physical line number int nrVecs; // physical vec number } PhysTileIterator; //----------------------------------------------------------------------------- int iterInit(PhysTileIterator *iter, const Tile *tile, int vecLen, unsigned int tileIterFlags); int iterIterate(PhysTileIterator *iter); /* * Check if the entire tile has been iterated. Return true if the iterator is * at the next element beyond the last. */ int iterIsEnd(const PhysTileIterator *iter); int iterSeek( PhysTileIterator *iter, int row, int col ); int iterSeekPhys( PhysTileIterator *iter, int line, int vec ); #endif clblas-2.10/src/library/blas/gens/tilemul.c000066400000000000000000000677521264277366700206700ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include #include #include #include "blas_kgen.h" #define MAX_LENGTH 4096 #define BITS_INT (sizeof(int) * 8) typedef enum VectMulType { VECT_MULT_REAL, VECT_MULT_COMPLEX_REAL, VECT_MULT_COMPLEX_IMAG } VectMulType; static const char *vectComponents = "0123456789abcdef"; static void getVecLens( const BlasGenSettings *gset, unsigned int *vlenA, unsigned int *vlenB, unsigned int *vlenC) { const CLBLASKernExtra *kextra = gset->kextra; bool distVect = ((gset->flags & BGF_DISTINCT_VECLEN) != 0); if (vlenA != NULL) { *vlenA = (distVect) ? kextra->vecLenA : kextra->vecLen; } if (vlenB != NULL) { *vlenB = (distVect) ? kextra->vecLenB : kextra->vecLen; } if (vlenC != NULL) { *vlenC = (distVect) ? kextra->vecLenC : kextra->vecLen; } } static TileMulCore checkReplaceCore( const BlasGenSettings *gset, TileMulCore core, bool tra, bool trb) { const SubproblemDim *subdims = gset->subdims; DataType dtype = gset->kextra->dtype; unsigned int vlenC; // 'dot' function can't be used for complex types if (isComplexType(dtype) && (core == TILEMUL_DOT)) { core = TILEMUL_MULADD; } // 'dot' is supported only for one case of vectors fetch // where A is fetched by rows and B - by columns if (core == TILEMUL_DOT && !(!tra && trb)) { core = TILEMUL_MULADD; } // dot is not supported for vector unaligned bwidth getVecLens(gset, NULL, NULL, &vlenC); if (core == TILEMUL_DOT && (subdims[1].bwidth % vlenC != 0)) { core = TILEMUL_MULADD; } return core; } static int checkTriggerPostFetch( struct KgenContext *ctx, const TileMulOpts *mulOpts, MatrixRole mrole) { int ret = 0; if (mulOpts->postFetch) { ret = mulOpts->postFetch(ctx, mrole, mulOpts->postFetchPriv); kgenAddBlankLine(ctx); } return ret; } /* * In an expression of a complex elements swap real and imaginary parts */ static void swapComplexComponents(Kstring *expr, unsigned int vecLen) { char *p; unsigned int i; char tmp; /* * If the string doesn't contain a suffix of vector components, then * construct it from scratch in the swapped form right away, otherwise * swap all even and odd components */ p = strchr(expr->buf, '.'); if (p == NULL) { strcat(expr->buf, ".s"); p = expr->buf + strlen(expr->buf); for (i = 0; i < vecLen; i++) { *p++ = vectComponents[2 * i + 1]; *p++ = vectComponents[2 * i]; } *p = '\0'; } else { p = expr->buf; i = (unsigned int)strlen(p) - 1; for (; vecLen != 0; i -= 2, vecLen--) { tmp = p[i]; p[i] = p[i - 1]; p[i - 1] = tmp; } } } static void takeComplexApart(Kstring *re, Kstring *im, const Kstring *src) { char *p; int i; p = strchr(src->buf, '.'); if (p == NULL) { strcpy(re->buf, src->buf); strcat(re->buf, ".s0"); strcpy(im->buf, src->buf); strcat(im->buf, ".s1"); } else { i = (int)strlen(src->buf) - 1; strcpy(re->buf, src->buf); strcpy(im->buf, src->buf); re->buf[i] = '\0'; im->buf[i - 1] = im->buf[i]; im->buf[i] = '\0'; } } /* * Select physical row in tile A depending on current row in tile C * and storing mode of A: whole or not, transposed or not */ static __inline unsigned int selectRowA(const Tile *a, unsigned int m, bool wholeA) { return (a->trans || wholeA) ? m : 0; } /* * Select physical column in tile A depending on current column in tile C * and storing mode of A: whole or not, transposed or not */ static __inline unsigned int selectColA(const Tile *a, unsigned int k, bool wholeA) { return (!a->trans || wholeA) ? 
k : 0; } /* * Common line segment length of 2 tiles being arguments in tile multiplication */ static unsigned int commonTileSegmentLen(const Tile *tile1, const Tile *tile2) { unsigned int u1, u2; u1 = tileLineSegmentLen(tile1); u2 = tileLineSegmentLen(tile2); return umin(u1, u2); } static void genPointerUpdate( struct KgenContext *ctx, const char *ptrName, const char *ldName, size_t bwidth, size_t bheight, unsigned int vecLen, DataType dtype, BlasGenFlags gflags, bool rowMaj, bool isLocal) { const char *uptr; Kstring tmp; const char *p; if (gflags & BGF_UPTRS) { getVectorTypeName(dtype, vecLen, NULL, &uptr); ksprintf(&tmp, "%s.%s", ptrName, uptr); p = tmp.buf; } else { p = ptrName; } if (rowMaj) { kgenPrintf(ctx, "%s += %lu;\n", p, bwidth / vecLen); } else if (isLocal) { kgenPrintf(ctx, "%s += %lu;\n", p, bwidth * (bheight / vecLen)); } else { Kstring ld; Kstring bwStr, madExpr; unsigned int scale; kstrcpy(&ld, ldName); ksprintf(&bwStr, "%lu", bwidth); scale = (gflags & BGF_LD_IN_VECTORS) ? 0 : vecLen; sprintfFastScalarMad(&madExpr, &bwStr, &ld, scale, NULL); kgenPrintf(ctx, "%s += %s;\n", p, madExpr.buf); } } static void genRealMulUpdate( struct KgenContext *ctx, const Kstring *elA, const Kstring *elB, const Kstring *elC, bool transC, TileMulCore core) { char tmp[MAX_LENGTH]; const char *src1, *src2; /* * Select order of source operands because type of 'mad' result is * determined by the first operand */ src1 = (transC) ? elA->buf : elB->buf; src2 = (transC) ? elB->buf : elA->buf; if (core == TILEMUL_MAD) { sprintf(tmp, "%s = mad(%s, %s, %s);\n", elC->buf, src1, src2, elC->buf); } else { sprintf(tmp, "%s += %s * %s;\n", elC->buf, src1, src2); } kgenAddStmt(ctx, tmp); } // Generate complete vector-vector product static void genVecMul( struct KgenContext *ctx, unsigned int m, unsigned int n, const Tile *a, const Tile *b, const Tile *c, bool conjA, bool conjB, TileMulCore core, bool wholeA) { unsigned int k; char tmp[MAX_LENGTH]; Kstring elA, elB, elC; unsigned int vlen = 0; bool isComplex; bool isDouble; isDouble = isDoubleBasedType(c->dtype); isComplex = isComplexType(c->dtype); if ((core == TILEMUL_DOT) && !isComplex) { vlen = commonTileSegmentLen(a, b); } else { vlen = 1; } sprintfTileElement(&elC, c, m, n, 1); if (!wholeA) { m = 0; } for (k = 0; k < a->nrCols; k += vlen) { sprintfTileElement(&elA, a, m, k, vlen); sprintfTileElement(&elB, b, k, n, vlen); /* * Using 'dot' is not valid for complex, and replaced with '*' operator * for unvectorized real data */ if ((core == TILEMUL_DOT) && (vlen > 1)) { sprintf(tmp, "%s += dot(%s, %s);\n", elC.buf, elA.buf, elB.buf); } else if (isComplex) { Kstring expr; sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC, isDouble, conjA, conjB, core); kgenAddStmt(ctx, expr.buf); } else { genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core); } } } /* * Generate complete vector-vector product using separate multiple-add * operations and explicit vectorization */ static void genVectorizedVecMulAdd( struct KgenContext *ctx, unsigned int m, unsigned int n, const Tile *a, const Tile *b, const Tile *c, bool conjA, bool conjB, VectMulType type, bool wholeA) { unsigned int k; unsigned int sumLen; char tmp[MAX_LENGTH], tmp2[MAX_LENGTH]; char *str = tmp; const char *s; char op; Kstring elA, elB, elC; unsigned int vlen; // signs for even and odd components int signs[2] = {0, 0}; vlen = commonTileSegmentLen(a, b); if (!wholeA) { m = 0; } if (type == VECT_MULT_REAL) { sprintfTileElement(&elC, c, m, n, 1); sumLen = vlen; } else { TileElementHalf half = (type == 
VECT_MULT_COMPLEX_REAL) ? TE_HALF_LOW : TE_HALF_HIGH; sprintfTileElementHalf(&elC, c, m, n, half); sumLen = vlen * 2; if (type == VECT_MULT_COMPLEX_REAL) { if ((conjA && conjB) || (!conjA && !conjB)) { signs[1] = 1; } } else if (!(conjA && conjB)) { /* * When both the matrix are conjugated, the sum is substracted * from the temporary result */ signs[0] = (int)conjB; signs[1] = (int)conjA; } } // initial expression sprintfTileElement(&elA, a, m, 0, vlen); sprintfTileElement(&elB, b, 0, n, vlen); if (type == VECT_MULT_COMPLEX_IMAG) { swapComplexComponents(&elB, vlen); } str += sprintf(str, "sum = %s * %s", elA.buf, elB.buf); // add expressions for remaining elements for (k = vlen; k < a->nrCols; k += vlen) { sprintfTileElement(&elA, a, m, k, vlen); sprintfTileElement(&elB, b, k, n, vlen); if (type == VECT_MULT_COMPLEX_IMAG) { swapComplexComponents(&elB, vlen); } str += sprintf(str, " + %s * %s", elA.buf, elB.buf); } strcat(tmp, ";\n"); kgenAddStmt(ctx, tmp); // sum components of the temporary results str = tmp2; s = (signs[0]) ? "-" : ""; str += sprintf(tmp2, "%ssum.s0", s); for (k = 1; k < sumLen; k++) { op = signs[k & 1] ? '-' : '+'; str += sprintf(str, " %c sum.s%c", op, vectComponents[k]); } if ((type == VECT_MULT_COMPLEX_IMAG) && conjA & conjB) { op = '-'; } else { op = '+'; } sprintf(tmp, "%s %c= %s;\n", elC.buf, op, tmp2); kgenAddStmt(ctx, tmp); } /* * Generate one stage of vector-vector product. Iterating over M and N having * fixed coordinate over K. */ static void genStagedVecMul( struct KgenContext *ctx, unsigned int lineA, unsigned int k, const Tile *a, const Tile *b, const Tile *c, bool conjA, bool conjB, TileMulCore core, bool wholeA) { Kstring elA, elB, elC; unsigned int stepM, endM, stepN, vlenC; unsigned int i, j; unsigned int m, ma, ka; bool isDouble; bool isComplex; if (a->trans) { m = 0; endM = a->nrRows; } else { m = lineA; endM = m + 1; } isDouble = isDoubleBasedType(c->dtype); isComplex = isComplexType(c->dtype); if (( (c->trans == a->trans) || (c->trans == b->trans) ) && !isComplex) { if (c->trans) { stepM = vlenC = commonTileSegmentLen(a, c); stepN = 1; } else { stepM = 1; stepN = vlenC = commonTileSegmentLen(b, c); } } else { stepM = stepN = 1; vlenC = 1; } ka = selectColA(a, k, wholeA); for (i = m; i < endM; i += stepM) { ma = selectRowA(a, i, wholeA); sprintfTileElement(&elA, a, ma, ka, stepM); for (j = 0; j < b->nrCols; j += stepN) { sprintfTileElement(&elB, b, k, j, stepN); sprintfTileElement(&elC, c, i, j, vlenC); if (isComplex) { Kstring expr; sprintfComplexMulUpdate(&expr, &elC, &elA, &elB, &elC, isDouble, conjA, conjB, core); kgenAddStmt(ctx, expr.buf); } else { genRealMulUpdate(ctx, &elA, &elB, &elC, c->trans, core); } } } } /* check input values like x, y, bw to be fetch vector aligned and so on */ static int checkInput(const BlasGenSettings *gset, const TileMulOpts *mulOpts) { //bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); //bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); TileMulFlags mflags = mulOpts->flags; //bool cyclicGlobal = ((mflags & TILEMUL_GLOBAL_CYCLIC) != 0); bool isReal = ! isComplexType(gset->kextra->dtype); bool conjA = ((mflags & TILEMUL_CONJA) != 0); bool conjB = ((mflags & TILEMUL_CONJB) != 0); // This condition is not validate the case // when the matrix B is in the local memory // and the matrix A in the global memory. 
// //if ((localA ||localB) && cyclicGlobal) { // return -EINVAL; //} if (isReal && (conjA || conjB)) { /* 'Conjugated' flag can be used for complex types only */ return -EINVAL; } return 0; } static void genMulLineOnTile( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts, unsigned int lineOffset, bool wholeA) { TileMulFlags mflags = mulOpts->flags; const Tile *a = &gset->tileA; const Tile *b = &gset->tileBX; const Tile *c = &gset->tileCY; bool isReal; bool conjA, conjB; const SubproblemDim *subdims = gset->subdims; TileMulCore core; DataType dtype = gset->kextra->dtype; unsigned int j, n; n = (unsigned int)subdims[1].x; core = checkReplaceCore(gset, mulOpts->core, a->trans, b->trans); isReal = !isComplexType(dtype); conjA = ((mflags & TILEMUL_CONJA) != 0); conjB = ((mflags & TILEMUL_CONJB) != 0); if (a->trans || !b->trans) { unsigned int startK, endK; startK = (a->trans)? lineOffset : 0; endK = (a->trans)? lineOffset + 1 : (unsigned int)subdims[1].bwidth; for (j = startK; j < endK; j++) { genStagedVecMul(ctx, lineOffset, j, a, b, c, conjA, conjB, core, wholeA); } } else { bool vectorize = false; if (commonTileSegmentLen(a, b) > 1) { vectorize = ((mflags & TILEMUL_FORCE_VECTORIZATION) != 0); } for (j = 0; j < n; j++) { /* full dot product of row of A by column of B */ if ((core == TILEMUL_MULADD) && vectorize) { if (isReal) { genVectorizedVecMulAdd(ctx, lineOffset, j, a, b, c, false, false, VECT_MULT_REAL, wholeA); } else { genVectorizedVecMulAdd(ctx, lineOffset, j, a, b, c, conjA, conjB, VECT_MULT_COMPLEX_REAL, wholeA); genVectorizedVecMulAdd(ctx, lineOffset, j, a, b, c, conjA, conjB, VECT_MULT_COMPLEX_IMAG, wholeA); } } else { genVecMul(ctx, lineOffset, j, a, b, c, conjA, conjB, core, wholeA); } } } } void sprintfComplexMulUpdate( Kstring *expr, const Kstring *dst, const Kstring *a, const Kstring *b, const Kstring *c, bool isDouble, bool conjA, bool conjB, TileMulCore core) { Kstring swSrc1; // swapped element of the first source // real and imaginary part of the second source Kstring reSrc2, imSrc2; const Kstring *src11, *src12, *src21, *src22; const char *sign1 = "", *sign2 = "", *sign3 = ""; const char *baseType; baseType = (isDouble) ? "double2" : "float2"; /* * Prepare components for multiplying. We should get the following * vectorized operations: * * c = b * a1 + bsw * (-a2, a2) if both 'a' and 'b' are not conjugated * c = b * a1 + bsw * (a2, -a2) if 'b' is conjugated and 'a' is not * c = a * b1 + asw * (-b2, b2) if 'a' is conjugated and 'b' is not * c = asw * (-b2) + a * (b1, -b1) if both 'a' and 'b' are conjugated * * Where (a1, a2) and (b1, b2) are complex components of 'a' and 'b', * and asw and bsw - swapped elements of 'a' and 'b' respectively. */ src11 = (conjB) ? a : b; src21 = (conjB) ? b : a; kstrcpy(&swSrc1, src11->buf); swapComplexComponents(&swSrc1, 1); takeComplexApart(&reSrc2, &imSrc2, src21); if (conjA && conjB) { src12 = src11; src11 = &swSrc1; src21 = &imSrc2; src22 = &reSrc2; sign1 = sign3 = "-"; } else { src12 = &swSrc1; src21 = &reSrc2; src22 = &imSrc2; if (conjA || conjB) { sign3 = "-"; } else { sign2 = "-"; } } if (core == TILEMUL_MAD) { const char *strC = (c == NULL) ? "0" : c->buf; ksprintf(expr, "%s = mad(%s, %s%s, %s);\n" "%s = mad(%s, (%s)(%s%s, %s%s), %s);\n", dst->buf, src11->buf, sign1, src21->buf, strC, dst->buf, src12->buf, baseType, sign2, src22->buf, sign3, src22->buf, dst->buf); } else { const char *op = (dst == c) ? 
"+=" : "="; ksprintf(expr, "%s %s %s * %s%s + %s * (%s)(%s%s, %s%s)", dst->buf, op, src11->buf, sign1, src21->buf, src12->buf, baseType, sign2, src22->buf, sign3, src22->buf); if (!((c == NULL) || (c == dst))) { kstrcatf(expr, " + %s", c->buf); } kstrcatf(expr, "%s", ";\n"); } } void sprintfComplexMulUpdate_syr2k_beta0( Kstring *expr, const Kstring *dst, const Kstring *a, const Kstring *b, const Kstring *c, bool isDouble, bool conjA, bool conjB, TileMulCore core) { Kstring swSrc1; // swapped element of the first source // real and imaginary part of the second source Kstring reSrc2, imSrc2; const Kstring *src11, *src12, *src21, *src22; const char *sign1 = "", *sign2 = "", *sign3 = ""; const char *baseType; baseType = (isDouble) ? "double2" : "float2"; /* * Prepare components for multiplying. We should get the following * vectorized operations: * * c = b * a1 + bsw * (-a2, a2) if both 'a' and 'b' are not conjugated * c = b * a1 + bsw * (a2, -a2) if 'b' is conjugated and 'a' is not * c = a * b1 + asw * (-b2, b2) if 'a' is conjugated and 'b' is not * c = asw * (-b2) + a * (b1, -b1) if both 'a' and 'b' are conjugated * * Where (a1, a2) and (b1, b2) are complex components of 'a' and 'b', * and asw and bsw - swapped elements of 'a' and 'b' respectively. */ src11 = (conjB) ? a : b; src21 = (conjB) ? b : a; kstrcpy(&swSrc1, src11->buf); swapComplexComponents(&swSrc1, 1); takeComplexApart(&reSrc2, &imSrc2, src21); if (conjA && conjB) { src12 = src11; src11 = &swSrc1; src21 = &imSrc2; src22 = &reSrc2; sign1 = sign3 = "-"; } else { src12 = &swSrc1; src21 = &reSrc2; src22 = &imSrc2; if (conjA || conjB) { sign3 = "-"; } else { sign2 = "-"; } } if (core == TILEMUL_MAD) { const char *strC = (c == NULL) ? "0" : c->buf; ksprintf(expr, "%s = mad(%s, %s%s, %s);\n" "%s = mad(%s, (%s)(%s%s, %s%s), %s);\n", "sctmp", src11->buf, sign1, src21->buf, strC, dst->buf, src12->buf, baseType, sign2, src22->buf, sign3, src22->buf, "sctmp"); } else { const char *op = (dst == c) ? "+=" : "="; ksprintf(expr, "%s %s %s * %s%s + %s * (%s)(%s%s, %s%s)", dst->buf, op, src11->buf, sign1, src21->buf, src12->buf, baseType, sign2, src22->buf, sign3, src22->buf); if (!((c == NULL) || (c == dst))) { kstrcatf(expr, " + %s", c->buf); } kstrcatf(expr, "%s", ";\n"); } } int genMulTiles( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts) { char s[32]; const CLBLASKernExtra *kextra = gset->kextra; const char *tNameIn; unsigned int i; unsigned int iend; bool tra = ((mulOpts->flags & TILEMUL_TRA) != 0); bool trb = ((mulOpts->flags & TILEMUL_TRB) != 0); TileMulCore core; int ret; ret = checkInput(gset, mulOpts); if (ret) { return ret; } getVectorTypeName(kextra->dtype, kextra->vecLen, &tNameIn, NULL); core = checkReplaceCore(gset, mulOpts->core, tra, trb); if (((core == TILEMUL_MULADD || isComplexType(kextra->dtype)) && !tra && trb)) { sprintf(s,"%s sum;\n", tNameIn); kgenAddStmt(ctx, s); } iend = (unsigned int)((mulOpts->flags & TILEMUL_TRA) ? gset->subdims[1].bwidth : gset->subdims[1].y); for (i = 0; i < iend; i++) { genMulLineOnTile(ctx, gset, mulOpts, i, true); } // just to get state ret = kgenAddStmt(ctx, NULL); return (ret) ? 
-EOVERFLOW : 0; } int tileMulGen( struct KgenContext *ctx, const BlasGenSettings *gset, const TileMulOpts *mulOpts) { char s[MAX_LENGTH]; unsigned int vlenA, vlenB; unsigned int i, iend; //counters // size_t m, n, subK; int ret = 0; TileMulFlags mflags = mulOpts->flags; bool tra = ((mflags & TILEMUL_TRA) != 0); bool trb = ((mflags & TILEMUL_TRB) != 0); bool localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); bool localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); bool internalFetchB = ((mflags & TILEMUL_NOT_FETCH_B) == 0); bool bwStride = ((mflags & TILEMUL_BW_STRIDE) != 0); bool incK = ((mflags & TILEMUL_NOT_INC_K) == 0); const SubproblemDim *subdims = gset->subdims; size_t bwidth = bwStride ? subdims[0].bwidth : subdims[1].bwidth; TileMulCore core = mulOpts->core; DataType dtype = gset->kextra->dtype; const KernelVarNames *varNames = &gset->varNames; FetchOpts fetchOpts; struct FetchContext *fctx = mulOpts->fctx; FetchAddrMode addrMode; FetchOptLevel foptlev; struct StatementBatch *batch = NULL; const Tile *tile; memset(&fetchOpts, 0, sizeof(fetchOpts)); fetchOpts.memA = mulOpts->memA; fetchOpts.memB = mulOpts->memB; kgenAddStmt(ctx, "/* -- Tiles multiplier -- */\n"); getVecLens(gset, &vlenA, &vlenB, NULL); /* check generator input values */ ret = checkInput(gset, mulOpts); if (ret) { return ret; } if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) { sprintf(s, "for (int k1 = 0; k1 < %lu; k1 += %lu)", subdims[0].bwidth, subdims[1].bwidth); kgenBeginBranch(ctx, s); } core = checkReplaceCore(gset, core, tra, trb); if (((core == TILEMUL_MULADD || isComplexType(dtype)) && !tra && trb)) { unsigned int n; const char *tname; n = commonTileSegmentLen(&gset->tileA, &gset->tileBX); getVectorTypeName(gset->tileA.dtype, n, &tname, NULL); sprintf(s,"%s sum;\n", tname); kgenAddStmt(ctx, s); } // FIXME: remove this kludge for backward compatibility if (fctx == NULL) { fctx = createFetchContext(); if (fctx == NULL) { return -ENOMEM; } fetchOpts.mulOpts = mulOpts; } ////////////////////////////////////////////////////// foptlev = getFetchOptLevels(fctx); if ((gset->flags & BGF_WHOLE_A) && internalFetchB && (foptlev & FOPTLEV_MERGE_FETCHES)) { batch = createStmtBatch(); if (batch == NULL) { ret = -ENOMEM; goto out; } } /* * First, disable sharing internal variables of the fetch code for * the first call so as the fetch generator could declares it for the * first matrix. And then re-enable it when invoking the fetch for * the other matrix if it has been actually enabled. */ disableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB); /* * fetch elements of the matrix B, by rows or by columns depending on * the transposing flag */ if (internalFetchB) { tile = &gset->tileBX; fetchOpts.mrole = MATRIX_B; fetchOpts.linesNum = trb ? tile->nrCols : tile->nrRows; if (batch == NULL) { ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts); if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B); } } else { genFetchInputTileBatch(batch, fctx, gset, &fetchOpts); } } fetchOpts.mrole = MATRIX_A; if (foptlev & FOPTLEV_CAN_SHARE_TMP_AB) { enableFetchOptLevels(fctx, FOPTLEV_CAN_SHARE_TMP_AB); } if (ret) { goto out; } if (gset->flags & BGF_WHOLE_A) { tile = &gset->tileA; iend = (tra) ? 
tile->nrCols : tile->nrRows; fetchOpts.linesNum = iend; if (batch == NULL) { ret = genFetchInputTile(ctx, fctx, gset, &fetchOpts); } else { genFetchInputTileBatch(batch, fctx, gset, &fetchOpts); ret = flushStmtBatch(ctx, batch); if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_B); } } if (!ret) { ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A); } if (ret) { goto out; } // main multiplying loop for (i = 0; i < iend; i++) { if (i) { kgenAddBlankLine(ctx); } genMulLineOnTile(ctx, gset, mulOpts, i, true); } } else { iend = (unsigned int)((tra) ? subdims[1].bwidth : subdims[1].y); fetchOpts.linesNum = 1; // main multiplying loop for (i = 0; i < iend; i++) { if (i) { kgenAddBlankLine(ctx); revalidateFetchContext(fctx, MATRIX_A); } // fetch elements of matrix A from single row fetchOpts.lineOffset = i; genFetchInputTile(ctx, fctx, gset, &fetchOpts); ret = checkTriggerPostFetch(ctx, mulOpts, MATRIX_A); if (ret) { goto out; } genMulLineOnTile(ctx, gset, mulOpts, i, false); } } /* * increment K-related coordinates or pointers depending on addressing * mode */ addrMode = getFetchAddrMode(fctx); if (addrMode & FETCH_ADDR_K_RELATIVE) { kgenAddBlankLine(ctx); genPointerUpdate(ctx, varNames->A, varNames->lda, bwidth, subdims[0].y, vlenA, dtype, gset->flags, !tra, localA); genPointerUpdate(ctx, varNames->B, varNames->ldb, bwidth, subdims[0].x, vlenB, dtype, gset->flags, trb, localB); } else { if (incK && (varNames->k != NULL) && !(localA && localB)) { sprintf(s, "\n%s += %lu;\n", varNames->k, bwidth); kgenAddStmt(ctx, s); } } if (!bwStride && (subdims[0].bwidth != subdims[1].bwidth)) { kgenEndBranch(ctx, NULL); // k1 loop } ret = kgenAddStmt(ctx, "/* ---------------------- */\n"); ret = (ret) ? -EOVERFLOW : 0; out: if (batch != NULL) { destroyStmtBatch(batch); } if (fctx != mulOpts->fctx) { destroyFetchContext(fctx); } return ret; } clblas-2.10/src/library/blas/gens/trmm.c000066400000000000000000001142701264277366700201600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Cached global buffers based trmm generator */ #include #include #include #include #include #include #include #include #include "init.h" #include "blas_kgen.h" #include "blas_subgroup.h" #include "trxm_common.h" typedef struct { size_t staggered; } MAY_ALIAS extraData_t; static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void *extra); static SolverFlags solverFlags(void); static void fixupArgs( void *args, SubproblemDim *subdims, void *extra ); static int blockGetPerf( unsigned int kflags, const void *args ); static int subgGetPerf( unsigned int kflags, const void *args ); static void subgCalcThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra ); static int trmmGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs); static int trmmSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs ); static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check ); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static bool blockCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); static SolverOps blockSops = { generator, assignKargs, isFitToLDS, blockGetPerf, NULL, NULL, NULL, solverFlags, fixupArgs, trmmGetDefaultDecomp, // getDefaultDecomp blockCheckCalcDecomp, NULL, NULL}; // Solver options for subgroup pattern static SolverOps subgSops = { generator, assignKargs, NULL, subgGetPerf, NULL, subgCalcThreads, NULL, solverFlags, fixupArgs, trmmSubgGetDefaultDecomp, subgCheckCalcDecomp, NULL, NULL}; //----------------------------------------------------------------------------- static void initKernelVarNames(KernelVarNames *kvars) { kvars->A = "(Ag)"; kvars->B = "(Bg)"; kvars->C = "C"; kvars->coordA = "coord.y"; kvars->coordB = "coord.x"; kvars->k = "coord.z"; kvars->sizeK = "M"; kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->lda = "lda"; kvars->ldb = "ldb"; kvars->ldc = "ldb"; kvars->alpha = "alpha"; } //----------------------------------------------------------------------------- static void genInitCurrM( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags) { char tmp[1024]; if (isMatrixUpper(kflags)) { strcpy(tmp, "currM = 0;\n"); } else { sprintf(tmp, "currM = (M - 1) / %lu * %lu;\n", dim->y, dim->y); } kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); } //----------------------------------------------------------------------------- static void genStartPosK( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags, bool subgMode) { char tmp[1024]; if (isMatrixUpper(kflags)) { // K loop - from diagonal till M if (subgMode) { sprintf(tmp, "uint kBegin = currM;\n"); } else { if (!(kflags & KEXTRA_TAILS_M)) { sprintf(tmp, "uint kBegin = currM;\n"); } else { sprintf(tmp, "uint kBegin = currM / %lu * %lu;\n", dim->bwidth, dim->bwidth); } } } else { // K loop - from 0 till diagonal sprintf(tmp, "uint kBegin = 0;\n"); } kgenAddStmt(ctx, tmp); } //----------------------------------------------------------------------------- static void resetFetchNumA(TileMulOpts 
*mulOpts) { TilePostFetchPrivate *pfPriv; pfPriv = (TilePostFetchPrivate *) mulOpts->postFetchPriv; pfPriv[0].fetchNumA = 0; pfPriv[1].fetchNumA = 0; } //----------------------------------------------------------------------------- static int genSubgLoopsK( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, SubgVarNames* pSubgVNames, size_t staggered) { char tmp[1024]; KernelExtraFlags kflags = gset->kextra->flags; const size_t y0 = gset->subdims[0].y; const size_t bw1 = gset->subdims[1].bwidth; const size_t bw0 = gset->subdims[0].bwidth; // bw, that will be used for diagonal block evaluation size_t diagBw1 = getVecLen( gset, CLBLAS_TRMM, MATRIX_A ); // saving dimensions of tile A, that will be changed for // diagonal block size_t sDimA = gset->tileA.trans ? gset->tileA.nrRows: gset->tileA.nrCols; size_t sDimB = gset->tileBX.trans ? gset->tileBX.nrRows: gset->tileBX.nrCols; const CLBLASKernExtra* psKExtra = gset->kextra; CLBLASKernExtra diagKExtra; TilePostFetchPrivate postFPriv; int ret = 0; kgenPrintf( ctx, "uint k0;\n" ); kgenPrintf( ctx, "uint kMax;\n" ); // upper triangle case if (isMatrixUpper(kflags)) { // diagonal part ------------------------------------------------------ // adjust tile and kextra settings for // processing diagonal block gset->subdims[1].bwidth = diagBw1; if ( gset->tileA.trans ) { gset->tileA.nrRows = diagBw1; } else { gset->tileA.nrCols = diagBw1; } if ( gset->tileBX.trans ) { gset->tileBX.nrRows = diagBw1; } else { gset->tileBX.nrCols = diagBw1; } memcpy( &diagKExtra,gset->kextra,sizeof(CLBLASKernExtra) ); diagKExtra.vecLenA = diagBw1 < psKExtra->vecLenA? diagBw1: psKExtra->vecLenA; diagKExtra.vecLenB = diagBw1 < psKExtra->vecLenB? diagBw1: psKExtra->vecLenB; gset->kextra = (const CLBLASKernExtra*)&diagKExtra; // Process the triangle block by the 0 item // of each subgroup kgenPrintf( ctx, "// k-coordinate of the end of diagonal block\n" ); kgenPrintf( ctx, "// calculated to be aligned to bw1\n"); kgenPrintf( ctx, "kMax = kBegin + %lu + (%lu - %lu%%(kBegin+%lu));\n", y0, bw1, bw1, y0); sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId ); kgenBeginBranch( ctx, tmp ); sprintf( tmp, "for( k0=kBegin; (k0varNames.k ); mulOpts->postFetch = genTrxmPostFetchZero; ret = tileMulGen( ctx, gset, mulOpts ); if( 0 != ret ){ return ret; } kgenEndBranch(ctx, NULL);// for() kgenEndBranch(ctx, NULL);// if( itemId.x == 0 ) // Restore tile and kextra settings to the // original parameters gset->subdims[1].bwidth = bw1; if ( gset->tileA.trans ) { gset->tileA.nrRows = sDimA; } else { gset->tileA.nrCols = sDimA; } if ( gset->tileBX.trans ) { gset->tileBX.nrRows = sDimB; } else { gset->tileBX.nrCols = sDimB; } gset->kextra = psKExtra; // rectangle part ----------------------------------------------------- kgenAddBlankLine( ctx ); kgenPrintf( ctx, "k0 = kMax;\n" ); if ( kflags & KEXTRA_TAILS_K_LOWER ) { kgenPrintf( ctx, "uint alignedK = M-(M%%%lu);\n", bw1 ); } // strided access sprintf(tmp, "for ( k0 = k0+%s.x*%lu; k0 < %s; k0 += %lu )", pSubgVNames->itemId, bw1, ( kflags & KEXTRA_TAILS_K_LOWER )? 
"alignedK" : "M", bw0); kgenBeginBranch(ctx, tmp); // TODO: make staggered access operational with lower-K tails /*kgenPrintf( ctx, "%s = (kBegin+%d) + ( m0*64*(gid%%2) + k0 )%%(M-(kBegin+%d));\n", gset->varNames.k, diagW, diagW); */ kgenPrintf( ctx, "%s = k0;\n", gset->varNames.k ); mulOpts->postFetch = NULL; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); // rectangle tail part ------------------------------------------------ if ( kflags & KEXTRA_TAILS_K_LOWER ) { kgenAddBlankLine( ctx ); kgenPrintf( ctx, "// lower K tail is handled by item 0 of each subgroup\n"); sprintf(tmp, "if( (%s.x == 0)&&(kMax < M) )", pSubgVNames->itemId); kgenBeginBranch( ctx, tmp ); kgenPrintf( ctx, "%s = alignedK;\n", gset->varNames.k ); postFPriv.fetchNumA = 0; postFPriv.gset = gset; mulOpts->postFetch = defaultTilePostFetch; mulOpts->postFetchPriv = &postFPriv; ret = tileMulGen( ctx, gset, mulOpts ); if ( ret != 0 ) { return ret; } kgenEndBranch( ctx, NULL ); } } // lower triangle case else { // rectangle part ----------------------------------------------------- kgenPrintf( ctx, "kMax = currM - currM%%%lu;\n", bw1 ); // strided access, staggered access sprintf( tmp, "for( k0 = 0; k0 < kMax; k0 += %lu )", bw0 ); kgenBeginBranch( ctx, tmp ); kgenPrintf( ctx, "%s=(k0+%s.x*%d+%d*gid)%%kMax;\n", gset->varNames.k, pSubgVNames->itemId, bw1, staggered/bw1*bw1 ); mulOpts->postFetch = NULL; // part without diagonal elements post fetch zeroing ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch( ctx, NULL ); // diagonal part ------------------------------------------------------ // adjust tile and kextra settings for // processing diagonal block gset->subdims[1].bwidth = diagBw1; if ( gset->tileA.trans ) { gset->tileA.nrRows = diagBw1; } else { gset->tileA.nrCols = diagBw1; } if ( gset->tileBX.trans ) { gset->tileBX.nrRows = diagBw1; } else { gset->tileBX.nrCols = diagBw1; } psKExtra = gset->kextra; memcpy( &diagKExtra,gset->kextra,sizeof(CLBLASKernExtra) ); diagKExtra.vecLenA = diagBw1 < psKExtra->vecLenA? diagBw1: psKExtra->vecLenA; diagKExtra.vecLenB = diagBw1 < psKExtra->vecLenB? 
diagBw1: psKExtra->vecLenB; gset->kextra = (const CLBLASKernExtra*)&diagKExtra; // process the triangle block by the 0 item // of each subgroup sprintf( tmp, "if( %s.x == 0 )", pSubgVNames->itemId ); kgenBeginBranch( ctx, tmp ); sprintf( tmp, "for( k0 = kMax; (k0 < currM+%lu)&&(k0 < M); k0 += %lu )", y0, diagBw1 ); kgenBeginBranch( ctx, tmp ); kgenPrintf( ctx, "%s=k0;\n", gset->varNames.k ); mulOpts->postFetch = genTrxmPostFetchZero; resetFetchNumA(mulOpts); ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch( ctx, NULL );// for() kgenEndBranch( ctx, NULL );// if( itemId.x == 0 ) // Restore tile and kextra settings to the // original parameters gset->subdims[1].bwidth = bw1; if ( gset->tileA.trans ) { gset->tileA.nrRows = sDimA; } else { gset->tileA.nrCols = sDimA; } if ( gset->tileBX.trans ) { gset->tileBX.nrRows = sDimB; } else { gset->tileBX.nrCols = sDimB; } gset->kextra = psKExtra; } return 0; } //----------------------------------------------------------------------------- static int genLoopsK( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, char *tmp) { KernelExtraFlags kflags = gset->kextra->flags; const size_t y0 = gset->subdims[0].y; const size_t bwidth = gset->subdims[1].bwidth; int ret; bool isRel = false; const char *inTypeNameA, *inPtrNameA, *inTypeNameB, *inPtrNameB; getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenA, &inTypeNameA, &inPtrNameA); getVectorTypeName(gset->kextra->dtype, gset->kextra->vecLenB, &inTypeNameB, &inPtrNameB); sprintf(tmp, "uint k0;\n"); kgenAddStmt(ctx, tmp); if (!(kflags & (KEXTRA_TAILS_M_LOWER | KEXTRA_TAILS_N_LOWER | KEXTRA_TAILS_K_LOWER))) { FetchAddrMode addrMode = FETCH_ADDR_A_RELATIVE | FETCH_ADDR_B_RELATIVE | FETCH_ADDR_K_RELATIVE; isRel = true; mulOpts->fctx = createFetchContext(); if (mulOpts->fctx == NULL) { return -ENOMEM; } setFetchAddrMode(mulOpts->fctx, addrMode); gset->varNames.A = "pA"; gset->varNames.B = "pB"; } else { gset->flags |= BGF_UPTRS; kgenPrintf(ctx, "GPtr Ag, Bg;\n" "\n" "Ag.%s = A;\n" "Bg.%s = B;\n\n", inPtrNameA, inPtrNameB); } if (isMatrixUpper(kflags)) { if (isRel) { switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) | (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^ ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0)) ) { case 0: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 1: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 2: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 3: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; } } sprintf(tmp, "for (k0 = kBegin; " "(k0 <= (kBegin + %luu))&&(k0 < M); " "k0 += %lu)", y0, bwidth); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0;\n"); mulOpts->postFetch = genTrxmPostFetchZero; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); //main triangle part sprintf(tmp, "for 
(; k0 <= max(0, (int)M - %lu); k0 += %lu)", y0, gset->subdims[1].bwidth); kgenBeginBranch(ctx, tmp); mulOpts->postFetch = NULL; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); // matrix side part // should be calculated by item0 of each subgroup sprintf(tmp, "for (; k0 < M; k0 += %lu)", bwidth); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0;\n"); resetFetchNumA(mulOpts); mulOpts->postFetch = genTrxmPostFetchZero; ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } else { // lower size_t diagBlocks; //Number of bw *y blocks that fit in y*y square if (isRel) { switch ((((gset->kextra->flags & KEXTRA_TRANS_A) != 0)<<1) | (((gset->kextra->flags & KEXTRA_UPPER_TRIANG) != 0) ^ ((gset->kextra->flags & KEXTRA_COLUMN_MAJOR) != 0)) ) { case 0: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 1: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 2: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.y, lda, coord.z)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.x, ldb, coord.z)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; case 3: kgenPrintf(ctx, "__global %s *pA = (__global %s *)&A[mad24(coord.z, lda, coord.y)];\n" "__global %s *pB = (__global %s *)&B[mad24(coord.z, ldb, coord.x)];\n", inTypeNameA, inTypeNameA,inTypeNameB, inTypeNameB); break; } } diagBlocks = divRoundUp(y0, bwidth); sprintf(tmp, "uint iterK = min(currM + %luu, M);\n", y0); kgenAddStmt(ctx, tmp); sprintf(tmp, "iterK = (iterK + %lu) / %lu;\n", bwidth - 1, bwidth); kgenAddStmt(ctx, tmp); // main triangle part sprintf(tmp, "for (k0 = 0; k0 < max(0, (int)iterK - %lu); k0++)", diagBlocks); kgenBeginBranch(ctx, tmp); mulOpts->postFetch = NULL; // part without diagonal elements post fetch zeroing ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); // diagonal part sprintf(tmp, "for (; k0 < iterK; k0++)"); kgenBeginBranch(ctx, tmp); kgenPrintf( ctx, "coord.z = k0 * %lu;\n", bwidth); // diagonal blocks part mulOpts->postFetch = genTrxmPostFetchZero; resetFetchNumA(mulOpts); ret = tileMulGen(ctx, gset, mulOpts); if (ret != 0) { return ret; } kgenEndBranch(ctx, NULL); } if (isRel) { destroyFetchContext(mulOpts->fctx); mulOpts->fctx = NULL; } return 0; } //----------------------------------------------------------------------------- static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[4096]; struct KgenContext *ctx; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; bool doubleBased = isDoubleBasedType(dtype); size_t staggered = ((extraData_t*)&kextra->solverPriv)->staggered; int ret; BlasGenSettings gset; TileMulOpts mulOpts; int tra = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_A); int trb = isMatrixAccessColMaj(CLBLAS_TRMM, kflags, MATRIX_B); unsigned int l1Pans; TilePostFetchPrivate pfPriv[2]; UpdateResultFlags upResFlags; TailStatus tailStatus; bool subgMode = false; SubgVarNames subgVNames; ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) 
{ return -ENOMEM; } // mismatching subdims define case with subgroup decomposition subgMode = ( subdims[0].bwidth != subdims[1].bwidth ); memset(&gset, 0, sizeof(gset)); memcpy(gset.subdims, subdims, sizeof(gset.subdims)); gset.flags = BGF_DISTINCT_VECLEN; gset.flags |= BGF_WHOLE_A; /*FIXME: This used to be a workaround for compilation issues with dtrmm on * cpu. Normally BGF_WHOLE_A should be enabled always. But for now, * there are wrong results for non-aligned cases on CPU and there is * no workaround yet. if (kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N | KEXTRA_TAILS_K)) { gset.flags &= ~BGF_WHOLE_A; }*/ gset.kextra = kextra; gset.pgran = pgran; //avoid [0].bw loop //gset.subdims[0].bwidth = gset.subdims[1].bwidth; memset(pfPriv, 0, sizeof(pfPriv)); pfPriv[0].funcID = CLBLAS_TRMM; pfPriv[0].gset = &gset; if ((gset.flags & BGF_WHOLE_A) != 0) { pfPriv[0].wholeA = 1; } // at first, generate needed declarations kgenDeclareUptrs(ctx, doubleBased); // For inner callback, because both callbacks use own fetchNumA memcpy(&pfPriv[1], &pfPriv[0], sizeof(pfPriv[0])); // if both matrices are accessed row-major - using subgroup pattern if ( subgMode ) { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Subgroup", true, true); gset.flags |= BGF_UPTRS; } else { declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRMM, "Block", true, true); } kgenBeginFuncBody(ctx); initDefaultTiles(&gset, CLBLAS_TRMM, 0, PRIV_STORAGE_VARIABLE_SET); declareTileStorages(ctx, &gset); kgenAddStmt(ctx, "uint currM, currN;\n" "uint4 coord = 0; /* contains coordB, coordA, k */\n"); kgenDeclareLocalID(ctx, "lid", pgran); kgenDeclareGroupID(ctx, "gid", pgran); if ( subgMode ) { gset.varNames.LDS = "scratch"; // declaring variables used by subgroup mode subgVNames.itemId = "itemId"; subgVNames.subgCoord = "subgCoord"; kgenAddBlankLine( ctx ); kgenAddBlankLine(ctx); kgenPrintf(ctx, "int2 %s;\n", subgVNames.itemId ); kgenPrintf(ctx, "int2 %s;\n", subgVNames.subgCoord); // item ID kgenPrintf( ctx, "%s.x = get_local_id(0)%%%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup ID kgenPrintf( ctx, "%s.y = get_local_id(0)/%d;\n", subgVNames.itemId, subdims[0].bwidth/subdims[1].bwidth); // subgroup coordX kgenPrintf( ctx, "%s.x = %s.y/%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); // subgroup coordY kgenPrintf( ctx, "%s.y = %s.y%%%d;\n", subgVNames.subgCoord, subgVNames.itemId, subdims[0].y/subdims[1].y ); } kgenAddBlankLine(ctx); sprintf(tmp, "currN = gid * %lu;\n", subdims->x); kgenAddStmt(ctx, tmp); genInitCurrM(ctx, subdims, kflags); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, true); if ( subgMode ) { kgenAddStmt(ctx, "GPtr Ag = {A};\n" "GPtr Bg = {B};\n"); } l1Pans = (unsigned int)subdims[0].x / (unsigned int)subdims[1].x; memset(&mulOpts, 0, sizeof(mulOpts)); mulOpts.core = ((kflags & KEXTRA_ENABLE_MAD) != 0) ? 
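/* Select the tile-multiply core for the generated kernel: the mad()-based core
 * when KEXTRA_ENABLE_MAD is set in the extra flags, the plain multiply+add core
 * otherwise. */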
TILEMUL_MAD : TILEMUL_MULADD; mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.postFetch = NULL; mulOpts.postFetchPriv = &pfPriv; mulOpts.flags = TILEMUL_NO_FLAGS; mulOpts.flags |= TILEMUL_EXTERN_RDECL; if ( subgMode ) { mulOpts.flags |= TILEMUL_NOT_INC_K; mulOpts.flags |= TILEMUL_BW_STRIDE; } if (kflags & KEXTRA_TAILS_M_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } if (kflags & KEXTRA_TAILS_N_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; } if (kflags & KEXTRA_TAILS_K_LOWER) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; mulOpts.flags |= TILEMUL_WRAP_AROUND_TAIL; } if (tra) { mulOpts.flags |= TILEMUL_TRA; } if (!trb) { mulOpts.flags |= TILEMUL_TRB; } if (isMatrixConj(kflags, MATRIX_A)) { mulOpts.flags |= TILEMUL_CONJA; } if (isMatrixConj(kflags, MATRIX_B)) { mulOpts.flags |= TILEMUL_CONJB; } initKernelVarNames(&gset.varNames); if ( subgMode ) { kgenPrintf( ctx, "coord.x = currN + %s.x*%d;\n", subgVNames.subgCoord, subdims[1].x ); } else { sprintf(tmp, "coord.x = currN + lid %% %u * %lu;\n", l1Pans, subdims[1].x); kgenAddStmt(ctx, tmp); } // loop over M sprintf(tmp, "for (uint m0 = 0; m0 < M; m0 += %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); genStartPosK( ctx, subdims, kflags, subgMode ); sprintf(tmp, "coord.z = kBegin;\n"); kgenAddStmt(ctx, tmp); if ( subgMode ) { kgenPrintf(ctx, "coord.y = currM + %s.y*%d;\n", subgVNames.subgCoord, subdims[1].y); } else { sprintf( tmp, "coord.y = currM + lid / %u * %lu;\n", l1Pans, subdims[1].y ); kgenAddStmt(ctx, tmp); } genZeroTile(ctx, &gset.tileCY); checkGenBeginHitMatrixBlock(ctx, kflags); tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRMM, &gset, NULL); // loops along 'K' if ( subgMode ) { ret = genSubgLoopsK( ctx, &gset, &mulOpts, &subgVNames, staggered); } else { ret = genLoopsK( ctx, &gset, &mulOpts, tmp ); } if (ret != 0) { printf("%s", buf); return ret; } checkGenEndHitMatrixBlock(ctx, kflags); kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // store results // for result update - x coordinate is in elements, not in vectors checkGenRestoreTailCoords(ctx, &gset, tailStatus); upResFlags = kextraToUpresFlags(CLBLAS_TRMM, kflags); upResFlags |= tailStatusToUpresFlags(tailStatus); upResFlags |= UPRES_INDEXING_WITH_CONSTANTS; upResFlags |= UPRES_TRIANG_WRITE_C; upResFlags |= UPRES_EXCEED_PROBLEM_CONDITION; if ( subgMode ) { mergeUpdateResult( ctx, CLBLAS_TRMM, &gset, &subgVNames, upResFlags, genResultUpdateWithFlags ); } else { //checkGenBeginHitMatrixBlock(ctx, kflags); genResultUpdateWithFlags( ctx, CLBLAS_TRMM, &gset, upResFlags, NULL, NULL, NULL ); //checkGenEndHitMatrixBlock(ctx, kflags); } if (isMatrixUpper(kflags)) { sprintf(tmp, "currM += %lu;\n", subdims[0].y); } else { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); } kgenAddStmt(ctx, tmp); kgenEndBranch(ctx, NULL); kgenEndFuncBody(ctx); ret = kgenAddBlankLine(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyKgenContext(ctx); return (ret < 0) ? 
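/* A non-negative return value is the emitted source size plus one; any negative
 * kgen status from the context is reported as -EOVERFLOW. */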
-EOVERFLOW : ret; } //----------------------------------------------------------------------------- static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx; (void)extra; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initMemobjKarg(&args[6], blasArgs->B, NULL, 0, 0); //C in kernel initSizeKarg(&args[7], blasArgs->ldb.matrix); idx = 8; if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx], blasArgs->offBX); } } //----------------------------------------------------------------------------- static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { (void)dim; (void)dtype; (void)ldsSize; (void)kernelArgs; /* LDS is not used here so we surely fit to LDS */ return true; } //----------------------------------------------------------------------------- static SolverFlags solverFlags(void) { return (SF_WSPACE_1D); } //----------------------------------------------------------------------------- static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBlasKargs *kargs = (CLBlasKargs*)args; extraData_t *extraData = (extraData_t*)&((CLBLASKernExtra*)extra)->solverPriv; const size_t nChans = 8; // !!!DEVICE DEPENDED!!! const size_t wideChans = 64; // !!!DEVICE DEPENDED!!! const size_t sizeType[] = {1,2,2,4}; size_t sizeBlock = wideChans * nChans / sizeType[kargs->dtype]; size_t off = kargs->K % sizeBlock; if (off == 0) { ///!= or == ??? extraData->staggered = roundUp(subdims[1].bwidth * sizeType[kargs->dtype] , wideChans / sizeType[kargs->dtype]); } else { extraData->staggered = 0; } extraData->staggered = 64 / sizeType[kargs->dtype]; //fixed, not calculated fixupTrxmKargs((CLBlasKargs*)args); } //----------------------------------------------------------------------------- static bool blockCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { bool ret = true; DUMMY_ARG_USAGE(subdimsNum); if (check == PGRAN_CHECK) { unsigned int minSize, maxSize; maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8; minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 
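/* Per-dimension tile limits passed to decompSanityCheck() below: 1..4 elements
 * for complex double, 2..8 for every other data type. */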
1 : 2; ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true); ret = ret && (subdims[0].bwidth == subdims[1].bwidth); ret = ret && (pgran->wgSize[0] == 64); } else { calcPgranDedicated(pgran, subdims, -1, 3); } return ret; } //----------------------------------------------------------------------------- void initTrmmCachedBlockPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based trmm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &blockSops; mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- void initTrmmCachedSubgroupPattern(MemoryPattern *mempat) { mempat->name = "Cached global memory based subgroup trmm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &subgSops; mpatExtra.aMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_L2; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } //----------------------------------------------------------------------------- static int blockGetPerf( unsigned int kflags, const void *args ) { DUMMY_ARG_USAGE(args); if( !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){ return PPERF_AVERAGE; } return PPERF_GOOD; } //----------------------------------------------------------------------------- static int subgGetPerf( unsigned int kflags, const void *args ) { DUMMY_ARG_USAGE(args); if( !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_A ) && !isMatrixAccessColMaj( CLBLAS_TRMM, kflags, MATRIX_B ) ){ return PPERF_GOOD; } return PPERF_NOT_SUPPORTED; } //----------------------------------------------------------------------------- static void subgCalcThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra ) { CLBLASKernExtra* pKExtra; CLBlasKargs *pArgs; //EINVAL if ( NULL == subdims || NULL == pgran || NULL == args || NULL == extra ) { return; } pKExtra = (CLBLASKernExtra*)extra; pArgs = (CLBlasKargs*)args; // if side is right the dimensions outside kernel are swapped // A is NxN and B is MxN // inside kernel A is still MxM if ( pKExtra->flags & KEXTRA_SIDE_RIGHT ) { threads[0] = ( (pArgs->M/subdims[0].x) * 64 ); // B tail group if ( pArgs->M%subdims[0].x ) { threads[0] += 64;//pgran->wgSize[0]; } } else { threads[0] = ( (pArgs->N/subdims[0].x) * 64 ); // B tail group if ( pArgs->N%subdims[0].x ) { threads[0] += 64;//pgran->wgSize[0]; } } threads[1] = 0; } //----------------------------------------------------------------------------- static int trmmGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs) { (void*)subdimsNum; if ( NULL == pArgs ) { return -EINVAL; } subdims[1].bwidth = 2; subdims[1].x = subdims[1].itemX = 8; subdims[1].y = subdims[1].itemY = 8; subdims[0].bwidth = 2; subdims[0].x = subdims[0].itemX = 32; subdims[0].y = 128; subdims[0].itemY = -1; pgran->wgDim = 1; pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; return 0; } static int trmmSubgGetDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void *pArgs) { int itemsPerSubg = 4; int subgA = 8; int subgB = 2; int bw1 = 8; int x1 = 4; int y1 = 4; CLBlasKargs *kargs; DUMMY_ARG_USAGE(subdimsNum); if 
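/* The defaults above describe a 64-item work group split into 8x2 subgroups of
 * 4 items each (subgA * subgB * itemsPerSubg == wgSize). bw1 is halved further
 * below for complex and for double-based types, presumably to keep the subgroup
 * fetch vectors within the same byte width. */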
( NULL == pArgs ) { return -EINVAL; } kargs = (CLBlasKargs *)pArgs; if( isComplexType(kargs->dtype) ){ bw1 /= 2; } if( isDoubleBasedType(kargs->dtype) ){ bw1 /= 2; } subdims[1].bwidth = bw1; subdims[1].x = subdims[1].itemX = x1; subdims[1].y = subdims[1].itemY = y1; subdims[0].bwidth = bw1 * itemsPerSubg; subdims[0].itemX = x1 * subgB; subdims[0].x = x1*subgB; subdims[0].itemY = y1*subgA; subdims[0].y = y1*subgA; pgran->wgDim = 1; pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; return 0; } //----------------------------------------------------------------------------- // TODO: reimplement via new validation API static bool subgCheckCalcDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check ) { unsigned int subgA = 0; unsigned int subgB = 0; unsigned int regUse = 0; unsigned int itemsPerSubg = 0; DUMMY_ARG_USAGE(subdimsNum); if( 0 == subdims[0].x || 0 == subdims[0].y || 0 == subdims[0].bwidth || 0 == subdims[1].x || 0 == subdims[1].y || 0 == subdims[1].bwidth ){ return false; } subgA = subdims[0].y/subdims[1].y; subgB = subdims[0].x/subdims[1].x; itemsPerSubg = subdims[0].bwidth/subdims[1].bwidth; if( itemsPerSubg < 4 ){ return false; } if( subdims[1].y < 4 || subdims[1].x < 4 || subdims[1].bwidth < 4 ){ return false; } if( subdims[1].x != subdims[1].itemX || subdims[1].y != subdims[1].itemY ){ return false; } // the group block must consist of integer number of subgroup blocks if( subdims[0].x % subdims[1].x || subdims[0].y % subdims[1].y || subdims[0].bwidth % subdims[1].bwidth ){ return false; } //check fitting of bw to common vector sizes if( isComplexType(dtype) ){ if( 2*subdims[1].bwidth > 16 ){ return false; } } // check dimensions if( subdims[1].bwidth > 16 || subdims[1].x > 16 || subdims[1].y > 16 ){ return false; } // estimate register usage, drop // inevitably slowed decompositions regUse = ( subdims[1].bwidth * subdims[1].x + subdims[1].bwidth * subdims[1].y + subdims[1].x * subdims[1].y ) * dtypeSize(dtype); regUse /= 16; // 16 bytes per register if( regUse >= 64 ){ return false; } // passed PGranularity should be checked if( PGRAN_CHECK == check ){ if( pgran->wgDim != 1 ){ return false; } if( pgran->wgSize[0] != 64 ){ return false; } if( pgran->wgSize[0] != subgA*subgB*itemsPerSubg ){ return false; } } // PGranularity should be calculated else{ pgran->wgDim = 1; pgran->wgSize[0] = subgA * subgB * itemsPerSubg; } return true; } clblas-2.10/src/library/blas/gens/trmv_reg.cpp000066400000000000000000000320421264277366700213620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * trmv generator */ //#define DEBUG_TRMV #include #include #include #include #include #include #include #include #include "blas_kgen.h" #include #include #include extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; static SolverFlags solverFlags(void) { #ifdef DEBUG_TRMV printf("solverFlags callen......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void* extra ); extern "C" void initTrmvRegisterPattern(MemoryPattern *mempat); static KernelExtraFlags selectVectorization( void *kargs, unsigned int vlen ); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps trmvOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, selectVectorization }; static KernelExtraFlags selectVectorization( void *args, unsigned int vlen ) { KernelExtraFlags kflags = KEXTRA_NO_FLAGS; CLBlasKargs *kargs = (CLBlasKargs *)args; if( ( (kargs->uplo == clblasLower) && (kargs->order == clblasColumnMajor) ) || ( (kargs->uplo == clblasUpper) && (kargs->order == clblasRowMajor) ) ) { if( (kargs->N) % vlen) { kflags = KEXTRA_NO_COPY_VEC_A; } } if( kargs->pigFuncID == CLBLAS_TPMV || kargs->pigFuncID == CLBLAS_HPMV || kargs->pigFuncID == CLBLAS_SPMV ) { kflags = KEXTRA_NO_COPY_VEC_A; // Packed-case never do aligned access } return kflags; } static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_TRMV printf("Setting build options ... Double... 
for DOUBLE PRECISION support\n"); #endif } if( (step->funcID == CLBLAS_HEMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ONLY"); /* if(kargs->diag == clblasUnit) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ZERO_DIAG"); } */ } if ( kargs->pigFuncID == CLBLAS_SPMV ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DSPMV_ONLY"); } if( (kargs->pigFuncID == CLBLAS_TPMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) ) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initTrmvRegisterPattern(MemoryPattern *mempat) { #ifdef DEBUG_TRMV printf("initTRMVREgPattern called with mempat = 0x%p\n", mempat); #endif fflush(stdout); mempat->name = "Register accumulation based trmv"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &trmvOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; // For "x" vector mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block #ifdef DEBUG_TRMV printf("calcNrThreads called from TRMV_Reg.c\n"); #endif const CLBlasKargs *kargs = (const CLBlasKargs *)args; const CLBLASKernExtra *extra = ( CLBLASKernExtra *)_extra; clblasOrder order = ( extra->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extra->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extra->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); // unity and doConj handled in setKernelArgs if ( order == clblasRowMajor ) { order = clblasColumnMajor; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } } size_t TARGETROWS = (trans == clblasNoTrans) ? subdims->y : BLOCKSIZE/(subdims->y/extra->vecLenA); #ifdef DEBUG_TRMV printf("kargs-> N : %d, TARGETROWS: %d\n", kargs->N, TARGETROWS); #endif size_t blocks = ((kargs->N - 1)/ TARGETROWS) + 1; #ifdef DEBUG_TRMV printf("blocks : %d\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; #ifdef DEBUG_TRMV printf("pgran-wgSize[0] : %d, globalthreads[0] : %d\n", pgran->wgSize[0], threads[0]); #endif threads[1] = 1; } // // FIXME: Report correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { size_t BLOCKSIZE = pgran->wgSize[0]; char tempTemplate[32*1024]; char targetRows[10], blockSize[10]; if ( buf == NULL) // return buffer size { buflen = (64 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; #ifdef DEBUG_TRMV printf("TRMV GENERATOR called....\n"); #endif if((( extraFlags->flags & KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A ))) { #ifdef DEBUG_TRMV printf("A is trans or CONJ-TRANS\n"); #endif } else { #ifdef DEBUG_TRMV printf("A is noTrans...\n"); #endif } clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? 
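/* Recover uplo/order/trans from the kernel extra flags. Row-major problems are
 * folded into the column-major kernels below by swapping the transpose flag and
 * flipping uplo. */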
clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); // unity and doConj handled in setKernelArgs if ( order == clblasRowMajor ) { order = clblasColumnMajor; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper; } if ((subdims->y % extraFlags->vecLenA) != 0) { printf("WARNING: TRMV: generator: TARGETROWS must be divisible by Vector Length\n"); return 0; } size_t TARGETROWS = 0; if ( trans == clblasNoTrans) { #ifdef DEBUG_TRMV printf("clblasNoTrans....%s\n", ( uplo == clblasLower )?"LOWER":"UPPER"); #endif ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trmv_CL_kernel)) : (strcpy(tempTemplate, (char*)trmv_CU_kernel)); TARGETROWS = subdims->y; if ((BLOCKSIZE % TARGETROWS) != 0) { printf("WARNING: TRMV: generator: Invalid Block Size\n"); return 0; } } else // Transpose cases... { #ifdef DEBUG_TRMV printf("clblasTrans....%s\n", ( uplo == clblasLower )?"LOWER":"UPPER"); #endif ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trmv_CLT_kernel)) : (strcpy(tempTemplate, (char*)trmv_CUT_kernel)); if ((BLOCKSIZE % (subdims->y / extraFlags->vecLenA)) != 0) { printf("WARNING: TRMV: generator: Invalid Block Size\n"); return 0; } TARGETROWS = BLOCKSIZE/(subdims->y / extraFlags->vecLenA); } #ifdef DEBUG_TRMV printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_TRMV printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_TRMV printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_TRMV printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); sprintf( targetRows, "%" SPREFIX "u", TARGETROWS ); sprintf( blockSize, "%" SPREFIX "u", BLOCKSIZE ); #ifdef DEBUG_TRMV printf("TARGET ROWS = %s\n", targetRows); printf("BLOCK SIZE = %s\n", blockSize); #endif kobj.put("%TARGET_ROWS", (const char *)targetRows); kobj.put("%BLOCKSIZE", (const char *) blockSize); kobj.spit((char*)buf, tempTemplate); return (64 * 1024 * sizeof(char)); // return 0;//(ret < 0) ? -EOVERFLOW : ret; } /* (__global %TYPE const* restrict A, __global %TYPE * _xnew, __global %TYPE const* restrict _x_vector, uint N, int incx, int isUnity, uint lda, int doConj, uint offa, uint offx) */ static void assignKargs(KernelArg *args, const void *params, const void* ) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; //NOTE: This will not work if SolutionStep->args is not passed in const void *params. 
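/* The assignments below mirror the kernel signature quoted above: A, the result
 * buffer, the x vector, N, incx, isUnity, lda, doConj, offa and offx. For the
 * HEMV/HPMV/SPMV paths the roles of the B and C buffers are swapped and incy,
 * offy, alpha and beta are appended. */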
SolutionStep *step = container_of(blasArgs, args, SolutionStep); cl_int inc; cl_int unity, doConj; //bool incxOne = (blasArgs->ldb.vector == 1); //bool incyOne = (blasArgs->ldc.vector == 1); INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument if( (step->funcID == CLBLAS_HEMV) || (blasArgs->pigFuncID == CLBLAS_HPMV) || (blasArgs->pigFuncID == CLBLAS_SPMV) ) { INIT_KARG(&args[1], blasArgs->C); //y - since the 2nd argument is the result buffer, we should send y for HEMV INIT_KARG(&args[2], blasArgs->B); //x - actual x vector argument } else { INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument INIT_KARG(&args[2], blasArgs->C); //y - scratch == _x_vector argument } initSizeKarg(&args[3], blasArgs->N); inc = blasArgs->ldb.vector; INIT_KARG(&args[4], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[5], unity); initSizeKarg(&args[6], blasArgs->lda.matrix); doConj = (blasArgs->transA == clblasConjTrans); #ifdef DEBUG_TRMV printf("doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc); #endif INIT_KARG(&args[7], doConj); initSizeKarg(&args[8], blasArgs->offa); initSizeKarg(&args[9], blasArgs->offBX); // For HEMV both alpha and beta has to be passed. if( (step->funcID == CLBLAS_HEMV) || (blasArgs->pigFuncID == CLBLAS_HPMV) || (blasArgs->pigFuncID == CLBLAS_SPMV) ) { inc = blasArgs->ldc.vector; INIT_KARG(&args[10], inc); initSizeKarg(&args[11], blasArgs->offCY); assignScalarKarg(&args[12], &(blasArgs->alpha), blasArgs->dtype); assignScalarKarg(&args[13], &(blasArgs->beta), blasArgs->dtype); } return; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { size_t x, y; cl_ulong maxSize; CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs; //size_t tile; size_t maxBlockSize = 256; // PENDING: Query MAX_WORKGROUP_SIZE from OpenCL size_t extra; int naturalVecLength = sizeof(cl_float4)/sizeof(dtype); dim = dim; // Dummy- to remove warnings //extra = (blasArgs->transA == clblasNoTrans) ? dim[0].bwidth : dim[0].y; //extra = (extra > maxBlockSize) ? maxBlockSize : extra; // // TRMV is colMajor always... // y = 16; // Optimized for 16 float4 type reads by a quarter wavefront x = maxBlockSize / y; maxSize = x*y*sizeof(cl_float4); // PENDING: Implementing %REDUCE_SUM can bring this down to sizeof(cl_float) for non-transpose cases extra = ((blasArgs->transA == clblasNoTrans) ? x : (y*naturalVecLength)) * sizeof(dtype); return ((maxSize + extra) <= ldsSize); /* tile = dim[0].y * dim[0].bwidth; tile = (tile > maxBlockSize) ? (maxBlockSize) : tile; tile += extra; maxSize = tile * dtypeSize(dtype); */ } clblas-2.10/src/library/blas/gens/trsm.c000066400000000000000000001352101264277366700201630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * TRSM generator with support of cached reads from the global memory */ #include #include #include #include #include #include #include #include #include #include #include #include "dblock_kgen.h" #include "kerngen.h" #include "blas_kgen.h" #include "gen_helper.h" #include "trxm_common.h" #include "trsm_kgen.h" #include "legacy/blas_kgen_legacy.h" typedef enum LdsUseFlags { LDS_NO_USE = 0, LDS_USE_LARGE = 0x1, LDS_USE_DIAGONAL = 0x2 } LdsUseFlags; typedef struct TrsmExtraParams { int unrollingFactor; unsigned int unrolledTail; LdsUseFlags ldsUse; } TrsmExtraParams; enum TrsmStage { BLOCK_UPDATE, TILE_UPDATE }; static CLBLASMpatExtra mpatExtra; static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverFlags solverFlags(void); static void assignKargs(KernelArg *args, const void *params, const void *extra); static void fixupArgs(void *args, SubproblemDim *subdims, void *extra); static bool checkCalcDecompDedicated( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check); #if 0 static int getDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs); #endif static SolverOps trsmSops = { generator, assignKargs, isFitToLDS, NULL, NULL, NULL, NULL, solverFlags, fixupArgs, NULL,//getDefaultDecomp checkCalcDecompDedicated, NULL, NULL }; // The struct for storage tails typedef struct TileSet { Tile rectA; // The rectangular tile A for the update loop at stage 1 Tile squareA; // The square tile for the stage 2 Tile origB; // The rectangular tile B for the update loop at the stage 1 Tile bStage2; // The rectangular tile B for the update loop at thestage 2 Tile bAsSqA; // Descriptor for holding square tile A in the storage of B Tile bAsC; // Descriptor for holding tile C in the storage of B // the entire tile A matching the storage declared in the kernel Tile A; // the entire tile B matching the storage declared in the kernel Tile B; } TileSet; static bool useSkewedFetchB(const BlasGenSettings *gset) { KernelExtraFlags kflags = gset->kextra->flags; TrsmExtraParams *extraParams = (TrsmExtraParams*)gset->kextra->solverPriv; bool ret = false; if (extraParams->ldsUse & LDS_USE_LARGE) { ret = !isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); } return ret; } static void restoreTile(Tile* dst, const Tile* src) { dst->baseName = src->baseName; dst->vecLen = src->vecLen; dst->storType = src->storType; } static Tile substituteTile(Tile* dst, const Tile* src) { Tile tmp; restoreTile(&tmp, dst); restoreTile(dst, src); return tmp; } static void sprintfInvertedElement( Kstring *elem, const Tile *tile, unsigned int row, unsigned int col, unsigned int len, bool isU) { if (isU) { row = tile->nrRows - row - 1; col = tile->nrCols - col - len; } sprintfTileElement(elem, tile, row, col, len); } static void genTileInverting( struct KgenContext *ctx, const BlasGenSettings *gset, const TileSet *tileSet) { char tmp[1024]; const CLBLASKernExtra *kextra = gset->kextra; KernelExtraFlags kflags = kextra->flags; DataType dtype = kextra->dtype; const SubproblemDim *dim = &gset->subdims[1]; unsigned int accLen; unsigned int i, j, k; Tile srcTile; Tile dstTile; bool isU, isComplex; bool isInlined = gset->flags & BGF_EXPLICIT_INLINE; const char* typeNameA; const char* typeNameB; 
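/* Emit code that inverts the square diagonal tile of A currently held in the
 * tile storage of B ("b" is the source, "a" receives the inverse). The
 * destination tile starts out as the identity; rows are then finished one by
 * one (mirrored for the upper-triangular case via sprintfInvertedElement),
 * each step subtracting the previously finished row scaled by the matching
 * off-diagonal element and dividing the pivot row by the diagonal element.
 *
 * Illustrative scalar sketch of the generated arithmetic (lower-triangular
 * tile of size n; not part of the emitted kernel):
 *
 *   inv = I;
 *   for (i = 0; i < n; i++)
 *       for (j = i; j < n; j++)
 *           for (k = 0; k < n; k++) {
 *               if (i > 0)  inv[j][k] -= inv[i-1][k] * a[j][i-1];
 *               if (j == i) inv[j][k] /= a[i][i];
 *           }
 */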
memcpy(&srcTile, &tileSet->bAsSqA, sizeof(srcTile)); memcpy(&dstTile, &tileSet->squareA, sizeof(dstTile)); getVectorTypeName(kextra->dtype, dstTile.vecLen, &typeNameA, NULL); getVectorTypeName(kextra->dtype, srcTile.vecLen, &typeNameB, NULL); isU = isMatrixUpper(kflags); isComplex = isComplexType(dtype); if (isComplex || dstTile.trans) { accLen = 1; } else { accLen = umin(srcTile.vecLen, dstTile.vecLen); accLen = umin(accLen, srcTile.nrCols); } if (!isInlined) { dstTile.baseName = "a"; srcTile.baseName = "b"; sprintf(tmp, "void\n" "invertTile(%s *a, %s *b)\n", typeNameA, typeNameB); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); } else { kgenAddStmt(ctx, "// Invert tile\n"); } // made destination block unit genZeroTile(ctx, &dstTile); for (i = 0; i < dim->y; i++) { genSetUnitInTile(ctx, &dstTile, i, i); } kgenAddBlankLine(ctx); for (i = 0; i < dim->y; i++) { Kstring src, srcDiag, dst, dstLast; // current source diagonal element sprintfInvertedElement(&srcDiag, &srcTile, i, i, 1, isU); for (j = i; j < dim->y; j++) { // current source non diagonal element if (i) { sprintfInvertedElement(&src, &srcTile, j, i - 1, 1, isU); } for (k = 0; k < dim->y; k += accLen) { // current updated vectorized element sprintfInvertedElement(&dst, &dstTile, j, k, accLen, isU); // update if (i) { // last updated vectorized element sprintfInvertedElement(&dstLast, &dstTile, i - 1, k, accLen, isU); if (isComplex) { sprintf(tmp, "%s -= mul(%s, %s);\n", dst.buf, dstLast.buf, src.buf); } else { sprintf(tmp, "%s -= %s * %s;\n", dst.buf, dstLast.buf, src.buf); } kgenAddStmt(ctx, tmp); } // divide on the diagonal element if (j == i) { if (isComplex) { sprintf(tmp, "%s = div(%s, %s);\n", dst.buf, dst.buf, srcDiag.buf); } else { sprintf(tmp, "%s /= %s;\n", dst.buf, srcDiag.buf); } kgenAddStmt(ctx, tmp); } } } if (i != dim->y - 1) { kgenAddBlankLine(ctx); } } if (!isInlined) { kgenEndFuncBody(ctx); } kgenAddBlankLine(ctx); } static void declareLocalVariables( struct KgenContext *ctx, const BlasGenSettings *gset, Tile* parTile, TrsmExtraParams * extraParams) { char tmp[1024]; const SubproblemDim *dims = gset->subdims; const char* parTileTypeName = NULL; bool trb = isMatrixAccessColMaj(CLBLAS_TRSM, gset->kextra->flags, MATRIX_B); unsigned int locWidth; unsigned int tsize; unsigned int parTileSize; unsigned int l1Pans; unsigned int step; kgenAddStmt(ctx, "const int lid = get_local_id(0);\n" "const int gid = get_group_id(0);\n" "GPtr uA, uB;\n" "uint coordA, coordB;\n" "uint m0 = 0, k0, m1;\n"); if (isMatrixUpper(gset->kextra->flags)) { sprintf(tmp, "uint currM = (M - 1) / %lu * %lu;\n", dims[0].y, dims[0].y); kgenAddStmt(ctx, tmp); } /* * Declare private blocks. * The region 'b' stores in different time tiles of both * the input matrices and the result */ declareTileStorages(ctx, gset); *parTile = gset->tileBX; if (extraParams->ldsUse) { tsize = dtypeSize(gset->kextra->dtype); l1Pans = (unsigned int)(dims[0].x / dims[1].x); parTile->vecLen = (trb) ? 
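/* The local (LDS) copy of B is vectorized along dims[1].x when B is accessed
 * column-major (trb), along the block width otherwise, and is then clamped to
 * the number of elements that fit a float4. */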
(unsigned int)dims[1].x : (unsigned int)dims[1].bwidth; parTile->vecLen = umin(parTile->vecLen, sizeof(cl_float4) / tsize); parTile->trans = trb; /* * Allocate enough space in the local area to fit several tiles * at the stage1 (according to the unrolled factor) and one tile * at the stage2 */ locWidth = (unsigned int)dims[1].bwidth * extraParams->unrollingFactor; if (extraParams->ldsUse & LDS_USE_DIAGONAL) { locWidth = umax(locWidth, (unsigned int)dims[1].y); } if (trb) { parTile->nrRows = locWidth; parTile->nrCols = (unsigned int)dims[0].x; step = (unsigned int)dims[1].x / parTile->vecLen; } else { parTile->nrRows = (unsigned int)dims[0].x; parTile->nrCols = locWidth; step = (unsigned int)dims[1].x * locWidth / parTile->vecLen; } parTileSize = tileVectorsNum(parTile); getVectorTypeName(gset->kextra->dtype, parTile->vecLen, &parTileTypeName, NULL); sprintf(tmp, "__local %s tmpB[%i];\n" "LPtr lB;\n" "LPtr lBMain = {(__local float*)(tmpB + lid %% %u * %u)};\n", parTileTypeName, parTileSize, l1Pans, step); kgenAddStmt(ctx, tmp); if (useSkewedFetchB(gset)) { kgenPrintf(ctx, "const uint skewX = lid %% %u %% %lu;\n", l1Pans, gset->subdims[1].x); } } kgenAddBlankLine(ctx); } /* * Generate cyclical tile shifting so as to convert the skewed * storing to "one-to-one", i. e. the first element in the tile * matches to the first element of the respective tile in the * output matrix. */ static void genTileCyclicalShift(struct KgenContext *ctx, BlasGenSettings *gset) { const char *tname; Kstring k1, k2, *src, *dst, *ktmp; unsigned int row, col; unsigned int seglen; Tile *tileC = &gset->tileCY; seglen = tileLineSegmentLen(tileC); getVectorTypeName(gset->kextra->dtype, seglen, &tname, NULL); kgenAddStmt(ctx, "\n// deliver from skewing in the result\n"); kgenBeginBranch(ctx, "for (uint i = 0; i < skewX; i++)"); kgenPrintf(ctx, "%s tmp;\n\n", tname); src = &k1; dst = &k2; // Skewing may be used only in case of transposed C for (row = 0; row < tileC->nrRows; row += seglen) { sprintfTileElement(dst, tileC, row, tileC->nrCols - 1, seglen); kgenPrintf(ctx, "tmp = %s;\n", dst->buf); for (col = tileC->nrCols - 1; col > 0; col--) { sprintfTileElement(src, tileC, row, col - 1, seglen); kgenPrintf(ctx, "%s = %s;\n", dst->buf, src->buf); // swap pointer ktmp = src; src = dst; dst = ktmp; } kgenPrintf(ctx, "%s = tmp;\n", dst->buf); } kgenEndBranch(ctx, NULL); kgenAddBlankLine(ctx); } /* * Setup coordinates before beginning a trsm stage * A caller must ensure the strict stage sequence: * BLOCK_UPDATE -> TILE_UPDATE */ static void genSetupCoords( struct KgenContext *ctx, const BlasGenSettings *gset, enum TrsmStage stage) { char tmp[1024]; KernelExtraFlags kflags = gset->kextra->flags; const SubproblemDim *dims = gset->subdims; unsigned int l1Pans = (unsigned int)(dims[0].x / dims[1].x); const char *s; s = isMatrixUpper(kflags) ? 
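/* coordA starts at the current panel origin: 'currM' when the upper-triangular
 * kernel walks the matrix from the last block row upwards, 'm0' for the
 * lower-triangular top-down walk; k0 is then chosen per stage below. */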
"currM" : "m0"; sprintf(tmp, "coordA = %s + (lid / %u * %lu);\n", s, l1Pans, dims[1].y); kgenAddStmt(ctx, tmp); switch (stage) { case BLOCK_UPDATE: if (isMatrixUpper(kflags)) { sprintf(tmp, "k0 = currM + %lu;\n", dims[0].y); } else { sprintf(tmp, "k0 = 0;\n"); } break; case TILE_UPDATE: if (isMatrixUpper(kflags)) { sprintf(tmp, "k0 = currM + %lu - m1 * %lu;\n", dims[0].y - dims[1].y, dims[1].y); } else { sprintf(tmp, "k0 = m0 + m1 * %lu;\n", dims[1].y); } break; } kgenAddStmt(ctx, tmp); sprintf(tmp, "coordB = gid * %lu + (lid %% %u * %lu);\n", dims[0].x, l1Pans, dims[1].x); kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); } // Generate control block of the loop over K static void genInternalLoopCtl( struct KgenContext *ctx, const SubproblemDim *dim, KernelExtraFlags kflags, size_t stepK, size_t boundAlign) { char tmp[1024]; if (isMatrixUpper(kflags)) { if (kflags & KEXTRA_TAILS_M) { sprintf(tmp, "for (k0 = currM + %lu; k0 < M / %lu * %lu; " "k0 += %lu)", dim[0].y, boundAlign, boundAlign, stepK); } else { sprintf(tmp, "for (k0 = currM + %lu; k0 < M; k0 += %lu)", dim[0].y, stepK); } } else { sprintf(tmp, "for (k0 = 0; k0 < m0; k0 += %lu)", stepK); } kgenBeginBranch(ctx, tmp); } static void initKernelVarNames(KernelVarNames *kvars) { kvars->A = "uA"; kvars->B = "uB"; kvars->C = "B"; kvars->coordA = "coordA"; kvars->coordB = "coordB"; kvars->k = "k0"; kvars->sizeM = "M"; kvars->sizeN = "N"; kvars->sizeK = "M"; kvars->lda = "lda"; kvars->ldb = "ldb"; kvars->ldc = "ldb"; kvars->alpha = "alpha"; kvars->beta = "beta"; } static void setFetchHandler( TileMulOpts *mulOpts, const BlasGenSettings *gset, int handler(struct KgenContext *ctx, MatrixRole mrole, void *priv), TilePostFetchPrivate *priv) { int i, nrPrivs; const char *regName = NULL; if (handler == defaultTilePostFetch) { nrPrivs = 1; } else { nrPrivs = 2; regName = "b"; } for (i = 0; i < nrPrivs; i++) { priv[i].fetchNumA = 0; priv[i].wholeA = 1; priv[i].funcID = CLBLAS_TRSM; priv[i].gset = gset; priv[i].regName = regName; mulOpts->postFetch = handler; mulOpts->postFetchPriv = priv; } } static void genCheckShiftTailB( struct KgenContext *ctx, const BlasGenSettings *gset, int adjustRestore, TailStatus *tailStatus) { BlasGenSettings gsetNew; CLBLASKernExtra kextraNew; memcpy(&gsetNew, gset, sizeof(gsetNew)); memcpy(&kextraNew, gset->kextra, sizeof(kextraNew)); // avoid tail shift for the matrix A kextraNew.flags &= ~(KEXTRA_TAILS_M | KEXTRA_TAILS_M_LOWER); gsetNew.kextra = &kextraNew; if (adjustRestore) { checkGenRestoreTailCoords(ctx, &gsetNew, *tailStatus); } else { *tailStatus = checkGenAdjustTailCoords(ctx, CLBLAS_TRSM, &gsetNew, NULL); } } static void sprintfHitMatrixCond( char *buf, MatrixRole mrole, const char *prefix, const char *suffix) { const char *coordName; char bound; coordName = (mrole == MATRIX_A) ? "coordA" : "coordB"; bound = (mrole == MATRIX_A) ? 
'M' : 'N'; if (suffix == NULL) { suffix = ""; } sprintf(buf, "%s%s < %c%s", prefix, coordName, bound, suffix); } /* * 'mulUpd' arguments mean what action is being done: multiplication on * an inverted tile or subsequent update */ static void sprintfStage2Condition( char *buf, const BlasGenSettings *gset, int mulUpd) { KernelExtraFlags kflags = gset->kextra->flags; char hitCond[1024]; char *p; unsigned int xPans, yPans; hitCond[0] = '\0'; xPans = (unsigned int)(gset->subdims[0].x / gset->subdims[1].x); yPans = (unsigned int)(gset->subdims[0].y / gset->subdims[1].y); if (kflags & KEXTRA_TAILS_M) { sprintfHitMatrixCond(hitCond, MATRIX_A, " && ", NULL); } p = hitCond + strlen(hitCond); if (kflags & KEXTRA_TAILS_N) { sprintfHitMatrixCond(p, MATRIX_B, " && ", NULL); } if (!mulUpd) { if (isMatrixUpper(kflags)) { sprintf(buf, "if (lid / %u + m1 == %u%s)", xPans, yPans - 1, hitCond); } else { sprintf(buf, "if (lid / %u == m1%s)", xPans, hitCond); } } else { if (isMatrixUpper(kflags)) { sprintf(buf, "if (lid / %u + m1 < %u%s)", xPans, yPans - 1, hitCond); } else { sprintf(buf, "if (lid / %u > m1%s)", xPans, hitCond); } } } static void genZeroTileTrash( struct KgenContext *ctx, const BlasGenSettings *gset, MatrixRole mrole, Tile* tile) { char tmp[1024]; const SubproblemDim *dim = &gset->subdims[1]; const CLBLASKernExtra *kextra = gset->kextra; unsigned int i, j; unsigned int step; Kstring elem; if (mrole == MATRIX_A) { kgenAddBlankLine(ctx); } else { kgenBeginBranch(ctx, NULL); } sprintf(tmp, "const int bound = (coordA + %lu > M) ? (M - coordA) : %lu;\n", dim->y, dim->y); kgenAddStmt(ctx, tmp); step = tileLineSegmentLen(tile); step = (tile->trans) ? 1 : step; for (j = 0; j < tile->nrRows; ++j) { for (i = 0; i < tile->nrCols; i+=step) { sprintfTileElement(&elem, tile, j, i, step); sprintf(tmp, "%s = (bound <= %u) ? 0 : %s;\n", elem.buf, j, elem.buf); kgenAddStmt(ctx, tmp); } } // Set units in the trash diagonal elements for a tile of A if (mrole == MATRIX_A) { for (i = 0; i < (unsigned int)dim->y; i++) { sprintfTileElement(&elem, tile, i, i, 1); sprintf(tmp, "%s = (bound <= %d) ? %s : %s;\n", elem.buf, (int)i, strOne(kextra->dtype), elem.buf); kgenAddStmt(ctx, tmp); } } if (mrole == MATRIX_A) { kgenAddBlankLine(ctx); } else { kgenEndBranch(ctx, NULL); } } /* * NOTE: Before invoking this function 'tileA' must be initialized accordingly * so as it stores a square tile of the matrix A. 
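* (In this generator that is arranged by assigning tileSet->squareA to
* gset->tileA right before the stage-2 loop.)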
*/ static void genMulOnDiagonalTile( struct KgenContext *ctx, BlasGenSettings *gset, TileSet *tileSet, const TileMulOpts *mulOpts) { char tmp[1024]; FetchOpts fetchOpts; const SubproblemDim *dim = &gset->subdims[1]; TilePostFetchPrivate pfPriv[2]; TileMulOpts optsNew; const CLBLASKernExtra *extra = gset->kextra; CLBLASKernExtra extraNew; KernelExtraFlags kflags = extra->flags; Tile t; bool isTail; memset(&fetchOpts, 0, sizeof(fetchOpts)); fetchOpts.regName = "b"; fetchOpts.mrole = MATRIX_A; fetchOpts.lineOffset = 0; fetchOpts.linesNum = (unsigned int)dim->y; // setup options to multiply on the inverted tile memcpy(&optsNew, mulOpts, sizeof(TileMulOpts)); optsNew.flags &= ~TILEMUL_TRB; kgenAddStmt(ctx, "// Fetch and invert the square tile located on the " "diagonal\n"); // The matrix B play the role of A t = substituteTile(&gset->tileA, &tileSet->bAsSqA); isTail = ((kflags & KEXTRA_TAILS_M) != 0); genFetchInputTile(ctx, mulOpts->fctx, gset, &fetchOpts); setFetchHandler(&optsNew, gset, genTrxmPostFetchZero, pfPriv); /* * There is no needs in zeroing tail along K in case of the lower * triangular matrix because it is in the "other" triangle which is * never accessed */ if (isTail && !isMatrixUpper(kflags)) { memcpy(&extraNew, extra, sizeof(extraNew)); extraNew.flags &= ~KEXTRA_TAILS_K_LOWER; gset->kextra = &extraNew; } genTrxmPostFetchZero(ctx, MATRIX_A, pfPriv); /* * One must zero the tail part of a fetched square tile * in order to avoid influence of the trailing trash on the resulting * inverted tile (evaluating proceeds from the bottom towards the top * of the tile) */ if (isTail) { genZeroTileTrash(ctx, gset, MATRIX_A, &gset->tileA); } restoreTile(&gset->tileA, &t); if(gset->flags & BGF_EXPLICIT_INLINE) { genTileInverting(ctx, gset, tileSet); } else { sprintf(tmp, "invertTile(%s, %s);\n\n", tileSet->squareA.baseName, tileSet->bAsSqA.baseName); kgenAddStmt(ctx, tmp); } gset->tileBX = tileSet->bAsC; genTileCopy(ctx, &gset->tileBX, &gset->tileCY, TILECOPY_ASSIGN); /* * For the lower diagonal not integrally decomposed matrix A * it's enough to zero the tail part of the result in order to * clear trash accumulated over the update loop */ if (isTail && !isMatrixUpper(kflags)) { genZeroTileTrash(ctx, gset, MATRIX_B, &gset->tileBX); } genZeroTile(ctx, &gset->tileCY); genMulTiles(ctx, gset, &optsNew); kgenAddBlankLine(ctx); // restore original extra gset->kextra = extra; } static void genUpdateIntermResult( struct KgenContext *ctx, const BlasGenSettings *gset, bool withMhitCond, UpdateResultFlags flags) { char tmp[1024]; const char *coordY, *coordX; char *revAlp, *alp; DataType dtype = gset->kextra->dtype; KernelExtraFlags kflags = gset->kextra->flags; const SubproblemDim *dim = &gset->subdims[1]; const KernelVarNames *kvarNames = &gset->varNames; UpdateResultOp op; UpresVarNames uvars; const char* ctype; memset(&uvars, 0, sizeof(uvars)); op = (flags & UPRES_WITH_BETA) ? UPRES_SUM : UPRES_SET; uvars.startRow = kvarNames->coordA; uvars.startCol = kvarNames->coordB; uvars.nrRows = "y"; uvars.nrCols = "x"; uvars.result = "B"; uvars.ld = "ldb"; ctype = dtypeBuiltinType(dtype); if (isComplexType(dtype)) { if (dtype == TYPE_COMPLEX_FLOAT) { revAlp = "div((float2)(-1.f, 0), alpha)"; alp = "(float2)(1.f, 0)"; } else { revAlp = "div((double2)(-1., 0), alpha)"; alp = "(double2)(1., 0)"; } } else { revAlp = "-1. / alpha"; alp = "1."; } // inline result update flags |= UPRES_INLINE; coordY = kvarNames->coordA; coordX = kvarNames->coordB; /* * We should be careful here. 
* * The non tailed case of updateResult() is rewritted. * Now update result for tailed and non tailed cases have a bit * different semantics. * * The first one produces expressions like * 'dst = dst * beta + src * alpha'. * * Here 'dst' and 'src' may be private result stored in registers or * result to be updated in the global memory. Let the first one to be * designated as tileC and the second one as matC. * * The non tailed case produces expressions like * 'dst = matC * beta + tileC * alpha'. * * The second variant is more clear and native for the new implementation. * But as the difference is not eliminated, both the variants are * maintained here. */ if (!(kflags & (KEXTRA_TAILS_M | KEXTRA_TAILS_N))) { kgenBeginBranch(ctx, ""); sprintf(tmp, "%s %s = %s;\n" "%s alpha = beta;\n", ctype, "beta", revAlp, ctype); kgenAddStmt(ctx, tmp); updateResultGen(ctx, gset, CLBLAS_TRSM, op, flags & ~UPRES_WITH_BETA, &uvars); kgenEndBranch(ctx, NULL); } else { if (withMhitCond) { sprintf(tmp, "if ((%s < %s) && (%s < %s))", coordY, kvarNames->sizeM, coordX, kvarNames->sizeN); kgenBeginBranch(ctx, tmp); } else { /* for x, y variables scope */ kgenBeginBranch(ctx, NULL); } sprintf(tmp, "uint y = min(%luu, %s - (uint)%s);\n" "uint x = min(%luu, %s - (uint)%s);\n", dim->y, kvarNames->sizeM, coordY, dim->x, kvarNames->sizeN, coordX); kgenAddStmt(ctx, tmp); sprintf(tmp, "if ((y == %lu) && (x == %lu))", dim->y, dim->x); kgenBeginBranch(ctx, tmp); sprintf(tmp, "%s %s = %s;\n" "%s alpha = beta;\n", ctype, "beta", revAlp, ctype); kgenAddStmt(ctx, tmp); // optimized update updateResultGen(ctx, gset, CLBLAS_TRSM, op, flags & ~UPRES_WITH_BETA, &uvars); kgenEndBranch(ctx, NULL); flags |= UPRES_GENERIC; kgenBeginBranch(ctx, "else "); sprintf(tmp, "%s %s = %s;\n" "%s %s = %s;\n", ctype, "beta", revAlp, ctype, "alpha", alp); kgenAddStmt(ctx, tmp); // not optimized update updateResultGen(ctx, gset, CLBLAS_TRSM, op, flags, &uvars); kgenEndBranch(ctx, NULL); kgenEndBranch(ctx, NULL); } } static void genPreloadedTileMul( struct KgenContext *ctx, BlasGenSettings *gset, TileMulOpts *mulOpts, const Tile *parTile, const char* copy2LDSFuncName) { char tmp[1024]; KernelExtraFlags kflags = gset->kextra->flags; unsigned int bwidthOld; const char *oldNameB; const char *ptrName; getVectorTypeName(gset->kextra->dtype, parTile->vecLen, NULL, &ptrName); kgenPrintf(ctx, "lB.%s = tmpB;\n", ptrName); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { sprintf(tmp, "%s(lB, uB, gid * %lu, k0, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } else { sprintf(tmp, "%s(lB, uB, k0, gid * %lu, ldb);\n", copy2LDSFuncName, gset->subdims[0].x); } kgenAddStmt(ctx, tmp); kgenAddBarrier(ctx, CLK_LOCAL_MEM_FENCE); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "lB = lBMain;\n\n"); mulOpts->memB = CLMEM_LOCAL_MEMORY; oldNameB = gset->varNames.B; bwidthOld = (unsigned int)gset->subdims[0].bwidth; gset->varNames.B = "lB"; gset->subdims[0].bwidth = (parTile->trans) ? 
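/* While multiplying out of the preloaded LDS tile, the level-0 block width is
 * widened to the full local tile extent (rows or columns depending on its
 * orientation); the original B pointer name and block width are restored right
 * after tileMulGen(). */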
parTile->nrRows : parTile->nrCols; tileMulGen(ctx, gset, mulOpts); gset->varNames.B = oldNameB; gset->subdims[0].bwidth = bwidthOld; mulOpts->memB = CLMEM_GLOBAL_MEMORY; } static void initTiles( BlasGenSettings* gset, TileSet* tileSet, const struct SubproblemDim *subdims, KernelExtraFlags kflags, DataType dtype, PrivateStorageType storType) { unsigned int rowsA; unsigned int rowsB; unsigned int rowsC; unsigned int colsA; unsigned int colsB; unsigned int colsC; bool transA; bool transB; unsigned int vecLenA; unsigned int vecLenB; unsigned int vecLenC; rowsA = (unsigned int)subdims[1].y; colsA = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); rowsB = (unsigned int)szmax(subdims[1].y, subdims[1].bwidth); colsB = (unsigned int)szmax(subdims[1].x, subdims[1].y); rowsC = (unsigned int)subdims[1].y; colsC = (unsigned int)subdims[1].x; transA = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A); transB = isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B); vecLenA = (unsigned int)((transA) ? subdims[1].y : subdims[1].bwidth); vecLenA = umin(vecLenA, MAX_TILE_VECLEN); vecLenB = (unsigned int)((transB) ? subdims[1].x : subdims[1].bwidth); vecLenB = umin(vecLenB, MAX_TILE_VECLEN); vecLenC = (transB) ? vecLenB : vecLenA; initTile(&tileSet->rectA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].bwidth, vecLenA, dtype, storType, transA, false); initTile(&tileSet->squareA, "a", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenA, dtype, storType, transA, false); initTile(&tileSet->origB, "b", (unsigned int)subdims[1].bwidth, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bStage2, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, !transB, false); initTile(&tileSet->bAsSqA, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].y, vecLenB, dtype, storType, transA, false); initTile(&tileSet->bAsC, "b", (unsigned int)subdims[1].y, (unsigned int)subdims[1].x, vecLenB, dtype, storType, gset->tileCY.trans, false); initTile(&gset->tileA, "a", rowsA, colsA, vecLenA, dtype, storType, transA, false); initTile(&gset->tileBX, "b", rowsB, colsB, vecLenB, dtype, storType, !transB, false); initTile(&gset->tileCY, "c", rowsC, colsC, vecLenC, dtype, storType, !transB, false); tileSet->A = gset->tileA; tileSet->B = gset->tileBX; } static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tmp[1024]; struct KgenContext *ctx; ssize_t ret; CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; DataType dtype = kextra->dtype; KernelExtraFlags kflags = kextra->flags; CLBLASKernExtra extraNew; BlasGenSettings gset; TileMulOpts mulOpts; const char *ptrName; UpdateResultFlags upFlags = 0; TilePostFetchPrivate pfPriv; unsigned int l1Pans; bool b; Tile parTile; TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv; int ldsLarge, lds_diagonal; bool isInline; TileSet tileSet; char copy2LDSFuncName[FUNC_NAME_MAXLEN]; TailStatus tailStatus = 0; FetchAddrMode addrMode = 0; bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); bool tailN = ((kflags & KEXTRA_TAILS_N) != 0); size_t alignK; if (pgran->wgDim != 1) { return -EINVAL; } l1Pans = (unsigned int)(subdims[0].x / subdims[1].x); memset(&gset, 0, sizeof(gset)); gset.flags = BGF_WHOLE_A | BGF_EXPLICIT_INLINE | BGF_UPTRS; memcpy(gset.subdims, subdims, sizeof(SubproblemDim) * 2); // there is not need in block structure along K gset.subdims[0].bwidth = gset.subdims[1].bwidth; subdims = 
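/* From this point the generator works with its private copy of the
 * decomposition; the level-0 block width has been collapsed to the level-1
 * width since, as noted above, TRSM does not use a block structure along K at
 * the top level. */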
gset.subdims; /* * Since tiles are changed dynamically, e. g. in the main tilemul * loop they are rectangular, but at the second stage both A and B * tile storages are used for square tiles. One must adjust physical * vectorization accordindly, so as vector length might not be * greater than linear size of any tile */ memcpy(&extraNew, kextra, sizeof(extraNew)); extraNew.vecLenA = umin(kextra->vecLenA, (unsigned int)subdims[1].y); extraNew.vecLenB = umin(kextra->vecLenB, (unsigned int)subdims[1].y); gset.pgran = pgran; gset.kextra = &extraNew; initKernelVarNames(&gset.varNames); // multiplication options mulOpts.memA = CLMEM_GLOBAL_MEMORY; mulOpts.memB = CLMEM_GLOBAL_MEMORY; mulOpts.core = (kextra->flags & KEXTRA_ENABLE_MAD) ? TILEMUL_MAD : TILEMUL_MULADD; mulOpts.postFetch = NULL; mulOpts.flags = kextraToTilemulFlags(CLBLAS_TRSM, kflags); mulOpts.flags |= TILEMUL_EXTERN_RDECL | TILEMUL_NOT_INC_K; mulOpts.fctx = createFetchContext(); if (mulOpts.fctx == NULL) { return -ENOMEM; } disableFetchOptLevels(mulOpts.fctx, FOPTLEV_TMP_COORD_PRECOMPUTING); isInline = (gset.flags & BGF_EXPLICIT_INLINE); initTiles(&gset, &tileSet, subdims, kflags, dtype, PRIV_STORAGE_VARIABLE_SET); ctx = createKgenContext(buf, buflen, true); if (ctx == NULL) { destroyFetchContext(mulOpts.fctx); return -ENOMEM; } kgenAddStmt(ctx, "#pragma OPENCL EXTENSION cl_amd_printf : enable\n\n"); b = isDoubleBasedType(dtype); kgenDeclareUptrs(ctx, b); if (isComplexType(dtype)) { genComplexMathOperators(ctx, dtype); } if(!isInline) { genTileInverting(ctx, &gset, &tileSet); } if ( extraParams->ldsUse != LDS_NO_USE ) { SubproblemDim sdims; DBlockCopyFlags flags; unsigned int vecLen; if (!isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_B)) { sdims.x = gset.subdims[1].bwidth * extraParams->unrollingFactor; sdims.y = gset.subdims[0].x; } else { sdims.x = gset.subdims[0].x; sdims.y = gset.subdims[1].bwidth * extraParams->unrollingFactor; } vecLen = getVecLen(&gset, CLBLAS_TRSM, MATRIX_B); flags = (vecLen < 4) ? DBLOCK_COPY_NOT_VECTORIZE : 0; copyDataBlockGen(ctx, &sdims, gset.pgran, dtype, DBLOCK_GLOBAL_TO_LOCAL, flags); kgenAddBlankLine(ctx); kgenGetLastFuncName(copy2LDSFuncName, FUNC_NAME_MAXLEN, ctx); } declareTrxmKernel(ctx, dtype, pgran, kflags, CLBLAS_TRSM, "Cached", false, true); kgenBeginFuncBody(ctx); declareLocalVariables(ctx, &gset, &parTile, extraParams); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } genTrxmBMatrShift(ctx, kflags, false); ptrName = dtypeUPtrField(dtype); sprintf(tmp, "uB.%s = B;\n\n", ptrName); kgenAddStmt(ctx, tmp); // external loop sprintf(tmp, "for (m0 = 0; m0 < M; m0 += %lu)", subdims[0].y); kgenBeginBranch(ctx, tmp); genZeroTile(ctx, &gset.tileCY); genSetupCoords(ctx, &gset, BLOCK_UPDATE); kgenAddStmt(ctx, "// Stage 1. 
Multiply and update with large blocks\n"); gset.tileA = tileSet.rectA; gset.tileBX = tileSet.origB; if (!isMatrixUpper(kflags) && tailM) { addrMode |= FETCH_ADDR_A_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); } ldsLarge = ((extraParams->ldsUse & LDS_USE_LARGE) != 0); alignK = subdims[1].bwidth; if (ldsLarge) { alignK *= extraParams->unrollingFactor; } if (ldsLarge) { const char *oldCoordB; FetchAddrMode bamode = addrMode | FETCH_ADDR_K_RELATIVE; bool withSkew; withSkew = useSkewedFetchB(&gset); if (!withSkew) { bamode |= FETCH_ADDR_B_RELATIVE; } else { bamode |= FETCH_ADDR_B_CYCLICAL; } setFetchAddrMode(mulOpts.fctx, bamode); if (tailN) { /* * Conditional branch for those items which hit into * matrix B with their matrix coordinates */ sprintf(tmp, "if ((gid + 1) * %lu < N)", subdims[0].x); kgenBeginBranch(ctx, tmp); } if (isMatrixAccessColMaj(CLBLAS_TRSM, kflags, MATRIX_A)) { kgenPrintf(ctx, "uA.%s = A + k0 * lda;\n", ptrName); } else { kgenPrintf(ctx, "uA.%s = A + k0;\n", ptrName); } if (withSkew) { unsigned int bwidthOld; oldCoordB = gset.varNames.coordB; gset.varNames.coordB = "skewX"; bwidthOld = gset.subdims[0].bwidth; gset.subdims[0].bwidth = (parTile.trans) ? parTile.nrRows : parTile.nrCols; gset.subdims[0].bwidth = bwidthOld; } genInternalLoopCtl(ctx, subdims, kflags, alignK, alignK); genPreloadedTileMul(ctx, &gset, &mulOpts, &parTile, copy2LDSFuncName); genInternalLoopEnd(ctx); // loop over K if (withSkew) { gset.varNames.coordB = oldCoordB; setFetchAddrMode(mulOpts.fctx, bamode & ~FETCH_ADDR_B_CYCLICAL); // deliver from skew in the result before proceed to the next stage genTileCyclicalShift(ctx, &gset); } if (tailN) { kgenEndBranch(ctx, NULL); kgenBeginBranch(ctx, "else"); } setFetchAddrMode(mulOpts.fctx, addrMode); } if (!ldsLarge || tailN) { genCheckShiftTailB(ctx, &gset, 0, &tailStatus); if ((kflags & KEXTRA_TAILS_N_LOWER) && !tailStatus) { addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); } if (tailN) { sprintfHitMatrixCond(tmp, MATRIX_B, "if (", ")"); kgenBeginBranch(ctx, tmp); } genInternalLoopCtl(ctx, subdims, kflags, subdims[1].bwidth, alignK); tileMulGen(ctx, &gset, &mulOpts); genInternalLoopEnd(ctx); // loop over K if (tailN) { kgenEndBranch(ctx, NULL); } if (extraParams->ldsUse & LDS_USE_LARGE) { kgenEndBranch(ctx, NULL); } } sprintf(tmp, "uA.%s = A;\n\n", ptrName); kgenAddStmt(ctx, tmp); // processing tails along update dimension if (isMatrixUpper(kflags) && ((kflags & KEXTRA_TAILS_K_LOWER) || (ldsLarge && extraParams->unrolledTail))) { unsigned int tailChunks; tailChunks = (extraParams->ldsUse & LDS_USE_LARGE) ? extraParams->unrolledTail : 1; if (tailN) { char hitCond[1024]; sprintfHitMatrixCond(hitCond, MATRIX_B, "(", ")"); sprintf(tmp, "if ((currM + %lu < M) && %s)", subdims[0].y, hitCond); } else { sprintf(tmp, "if (currM + %lu < M)", subdims[0].y); } kgenBeginBranch(ctx, tmp); if (kflags & KEXTRA_TAILS_K_LOWER) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_K_CYCLICAL); setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); } if (tailChunks > 1) { mulOpts.flags &= ~TILEMUL_NOT_INC_K; sprintf(tmp, "for (uint k1 = 0; k1 < %u; k1++)", tailChunks); kgenBeginBranch(ctx, tmp); } addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); tileMulGen(ctx, &gset, &mulOpts); if (tailChunks > 1) { kgenEndBranch(ctx, NULL); mulOpts.flags |= TILEMUL_NOT_INC_K; } kgenEndBranch(ctx, NULL); } gset.tileA = tileSet.squareA; kgenAddStmt(ctx, "\n/*\n" " * Stage 2. 
A part of work items multiply got result on " "a respective\n" " * inverted diagonal block, and the remaining ones wait. " "Then they perform\n" " * one step of further intermediate result evaluation as " "multiplying tile by tile.\n" " * It continues until the whole panel of the " "matrix A is processed\n" " */\n"); // one must deal further with square blocks strictly gset.subdims[0].bwidth = gset.subdims[1].bwidth = gset.subdims[1].y; sprintf(tmp, "for (m1 = 0; m1 < %lu; m1++)", subdims[0].y / subdims[1].y); kgenBeginBranch(ctx, tmp); if (extraParams->ldsUse & LDS_USE_DIAGONAL) { sprintf(tmp, "const int bid = lid %% %u;\n\n", l1Pans); kgenAddStmt(ctx, tmp); } /* * Update the intermediate result multiply on the inverted diagonal tile, * and write back */ genSetupCoords(ctx, &gset, TILE_UPDATE); sprintfStage2Condition(tmp, &gset, 0); ret = kgenBeginBranch(ctx, tmp); upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); upFlags |= UPRES_PRIV_DEST | UPRES_WITH_BETA; genUpdateIntermResult(ctx, &gset, false, upFlags); kgenAddBlankLine(ctx); lds_diagonal = ((extraParams->ldsUse & LDS_USE_DIAGONAL) && (kflags & (KEXTRA_COLUMN_MAJOR)) == 0 && !(tailM || tailN) && !(upFlags & UPRES_NO_VECTORIZATION) && !isComplexType(kextra->dtype)); /* * it's needed now to adjust addressing mode of A so as to don't * exceed the bound of A */ if (tailM) { setFetchAddrMode(mulOpts.fctx, addrMode | FETCH_ADDR_A_CYCLICAL | FETCH_ADDR_K_CYCLICAL); extraNew.flags |= KEXTRA_TAILS_K_LOWER; } genMulOnDiagonalTile(ctx, &gset, &tileSet, &mulOpts); gset.tileBX = tileSet.bStage2; if (tailM) { setFetchHandler(&mulOpts, &gset, defaultTilePostFetch, &pfPriv); } kgenAddStmt(ctx, "// Write back the given result\n"); upFlags = kextraToUpresFlags(CLBLAS_TRSM, kflags); upFlags |= tailStatusToUpresFlags(tailStatus); if (lds_diagonal) { sprintf(tmp, "tmpB[%%u * %u + bid]", l1Pans); } genResultUpdateWithFlags(ctx, CLBLAS_TRSM, &gset, upFlags, NULL, NULL, lds_diagonal ? tmp : NULL); kgenEndBranch(ctx, NULL); // multiply on the inverted tile path kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); // continue the tile update kgenAddBlankLine(ctx); sprintfStage2Condition(tmp, &gset, 1); kgenBeginBranch(ctx, tmp); genCheckShiftTailB(ctx, &gset, 0, &tailStatus); if (lds_diagonal) { // TODO: add here storing to LDS as well } else { addrMode |= FETCH_ADDR_B_CYCLICAL; setFetchAddrMode(mulOpts.fctx, addrMode); tileMulGen(ctx, &gset, &mulOpts); } kgenEndBranch(ctx, NULL); // tile update path kgenAddBarrier(ctx, CLK_GLOBAL_MEM_FENCE); kgenEndBranch(ctx, NULL); // second stage loop if (isMatrixUpper(kflags)) { sprintf(tmp, "currM -= %lu;\n", subdims[0].y); kgenAddStmt(ctx, tmp); } kgenEndBranch(ctx, NULL); // loop over M ret = kgenEndFuncBody(ctx); if (!ret) { ret = (ssize_t)kgenSourceSize(ctx) + 1; } destroyFetchContext(mulOpts.fctx); destroyKgenContext(ctx); return (ret < 0) ? 
-EOVERFLOW : ret; } static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { (void)dim; (void)dtype; (void)ldsSize; (void)kernelArgs; return true; } static SolverFlags solverFlags(void) { return (SF_WSPACE_1D | SF_TOP_INPUT_SQUARE_BLOCKS); } static void assignKargs(KernelArg *args, const void *params, const void *extra) { const CLBlasKargs *blasArgs = (const CLBlasKargs*)params; KernelExtraFlags kflags = ((const CLBLASKernExtra*)extra)->flags; int idx = 7; initSizeKarg(&args[0], blasArgs->M); initSizeKarg(&args[1], blasArgs->N); assignScalarKarg(&args[2], &(blasArgs->alpha), blasArgs->dtype); initMemobjKarg(&args[3], blasArgs->A, NULL, 0, 0); initSizeKarg(&args[4], blasArgs->lda.matrix); initMemobjKarg(&args[5], blasArgs->B, NULL, 0, 0); initSizeKarg(&args[6], blasArgs->ldb.matrix); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { initSizeKarg(&args[idx++], blasArgs->offA); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { initSizeKarg(&args[idx], blasArgs->offBX); } } static void fixupArgs(void *args, SubproblemDim *subdims, void *extra) { CLBLASKernExtra *kextra = (CLBLASKernExtra*)extra; CLBlasKargs *kargs = (CLBlasKargs*)args; TrsmExtraParams *extraParams = (TrsmExtraParams *)kextra->solverPriv; size_t loadBatch; unsigned int wgSize; unsigned int workRatio; unsigned int ldsUse = LDS_NO_USE; KernelExtraFlags kflags = kextra->flags; SubproblemDim globDim; bool isAmdGPU; /* * Calculate size of the batch loaded from global to local memory * at each iteration of the stage 1. Choose such unrolling factor * that allow each work item to load at least 16 bytes that provides * efficient global memory access */ loadBatch = subdims[0].x * subdims[1].bwidth * dtypeSize(kargs->dtype); wgSize = (unsigned int)((subdims[0].x / subdims[1].itemX) * (subdims[0].y / subdims[1].itemY)); if (loadBatch < wgSize) { workRatio = 1; } else { workRatio = 16 / ((unsigned int)loadBatch / wgSize); if (!workRatio) { workRatio = 1; } } #ifndef NDEBUG { const char *envImpl = getenv("AMD_CLBLAS_TRSM_LDSUSE"); if (envImpl != NULL) { unsigned int w = atoi(envImpl); ldsUse = w % 10; w = w / 10; workRatio = w > 0 ? w : workRatio; } } #endif ldsUse = LDS_NO_USE; isAmdGPU = ((kflags & KEXTRA_VENDOR_AMD) != 0); if ((isAmdGPU && !(kflags & (KEXTRA_TAILS_K_LOWER | KEXTRA_TAILS_M_LOWER))) || (!isAmdGPU && !(kflags & KEXTRA_TAILS_M))) { ldsUse = LDS_USE_LARGE; } kargsToProbDims(&globDim, CLBLAS_TRSM, args, false); extraParams->ldsUse = ldsUse; extraParams->unrollingFactor = workRatio; extraParams->unrolledTail = (unsigned int)(((globDim.bwidth % (subdims[1].bwidth * workRatio)) + subdims[1].bwidth - 1) / subdims[1].bwidth); fixupTrxmKargs(kargs); } static bool checkCalcDecompDedicated( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, DataType dtype, int check) { bool ret = true; DUMMY_ARG_USAGE(subdimsNum); if (check == PGRAN_CHECK) { unsigned int minSize, maxSize; maxSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 4 : 8; minSize = (dtype == TYPE_COMPLEX_DOUBLE) ? 
1 : 2; ret = decompSanityCheck(subdims, minSize, maxSize, 24, dtype, true); ret = ret && (subdims[0].bwidth == subdims[1].bwidth); ret = ret && (pgran->wgSize[0] == 64); } else { calcPgranDedicated(pgran, subdims, -1, 3); } return ret; } void initTrsmLdsLessCachedPattern(MemoryPattern *mempat) { mempat->name = "2-staged cached global memory based block trsm"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 0; mempat->sops = &trsmSops; mpatExtra.aMset = CLMEM_LEVEL_L1; mpatExtra.bMset = CLMEM_LEVEL_L1; mpatExtra.mobjA = CLMEM_BUFFER; mpatExtra.mobjB = CLMEM_BUFFER; mempat->extra = &mpatExtra; } #if 0 static int getDefaultDecomp( PGranularity *pgran, SubproblemDim *subdims, unsigned int subdimsNum, void * pArgs) { pgran->wgDim = 1; pgran->wgSize[0] = 64; pgran->wgSize[1] = 1; subdims[0].x = subdims[0].itemX = 32; subdims[0].y = 64; subdims[0].itemY = SUBDIM_UNUSED; subdims[0].bwidth = subdims[1].bwidth = 4; subdims[1].x = subdims[1].itemX = 8; subdims[1].y = subdims[1].itemY = 4; } #endif clblas-2.10/src/library/blas/gens/trsm_kgen.c000066400000000000000000000032701264277366700211670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "trsm_kgen.h" void genComplexMathOperators( struct KgenContext *ctx, DataType dtype) { const char *ctype; char tmp[1024]; ctype = dtypeBuiltinType(dtype); sprintf(tmp, "%s\ndiv(%s u, %s v)\n", ctype, ctype, ctype); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); sprintf(tmp, "return (%s)((u.x * v.x + u.y * v.y) / " "(v.x * v.x + v.y * v.y)," "(u.y * v.x - u.x * v.y) / " "(v.x * v.x + v.y * v.y));\n", ctype); kgenAddStmt(ctx, tmp); kgenEndFuncBody(ctx); kgenAddBlankLine(ctx); sprintf(tmp, "%s\nmul(%s u, %s v)\n", ctype, ctype, ctype); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); sprintf(tmp, "return (%s)(u.x * v.x - u.y * v.y, u.x * v.y + u.y * v.x);\n", ctype); kgenAddStmt(ctx, tmp); kgenEndFuncBody(ctx); kgenAddBlankLine(ctx); } clblas-2.10/src/library/blas/gens/trsm_kgen.h000066400000000000000000000016421264277366700211750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef TRSM_KGEN_H_ #define TRSM_KGEN_H_ #include "blas_kgen.h" void genComplexMathOperators( struct KgenContext *ctx, DataType dtype); #endif /* TRSM_KGEN_H_ */ clblas-2.10/src/library/blas/gens/trsv_gemv.cpp000066400000000000000000000401321264277366700215500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * trsv gemv generator - * * This generator generates code for the GEMV portion of TRSV. * The idea is to call this routine after solving a subset of coefficients. * This generator will help to update the RHS of remaining equations using the * currently solved variables. * The current clBLAS implementation of GEMV does not have support complex types. * Hence, Need to write this kludge. * One day, this should go away and be completely replaced by existing GEMV * * NOTE: * This generator is highly tied to TRSV and is not a replacement for GEMV. * In some cases, this generator generates code not only for updating the RHS * but also for solving the next triangle (trtri based solve) as well. * We have seen marginal performance increases (1GB/s) by doing so. * If this is not important, one can replace this with GEMV when GEMV becomes * feature-complete. 
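 *
 * A minimal host-side sketch of the blocked substitution this update helps
 * implement (illustrative only; 'nSolved' and the plain C loop below are
 * hypothetical and not the kernel's actual parameters or code). For a
 * column-major, lower-triangular, non-transposed system, once the first
 * nSolved unknowns are known they are folded into the remaining right-hand
 * side entries before the next triangle is solved:
 *
 *     for (size_t i = nSolved; i < N; i++) {
 *         for (size_t j = 0; j < nSolved; j++) {
 *             b[i] -= A[i + j * lda] * x[j];   // column-major access of A
 *         }
 *     }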
*/ #include #include #include #include #include #include #include #include #include #include #include //#define DEBUG_TRSV_GEMV extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2) static SolverFlags solverFlags(void) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV solverFlags(): solverFlags called......\n"); #endif return (SF_WSPACE_1D); } static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT); static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS); static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initTrsvGemvDefaultPattern(MemoryPattern *mempat); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static SolverOps trsvGemvOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, solverFlags, NULL, NULL, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->pigFuncID == CLBLAS_TPSV) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); #ifdef DEBUG_TRSV_GEMV printf("TPSV GEMV: Setting build options ... PACKED\n"); #endif } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initTrsvGemvDefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: initTrsvGemvDefaultPattern called with mempat = 0x%p\n", (void*)mempat); #endif mempat->name = "TRSV - GEMV Update Kernel"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &trsvGemvOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_BUFFER; // == No images mpatExtra.mobjB = CLMEM_BUFFER; // == No images mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } /* * Helper function that helps in calculating the "TARGET WIDTH" of * a block with Block Size needed for the case where * "theight" number of variables have been solved. * This is applicable only to NON-TRANSPOSE cases. 
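 *
 * Worked example (illustrative): for theight = 64, blk_size = 256 and
 * vwidth = 4 the function computes nLoops_v = (64 * 64) / 256 = 16 and
 * nLoops = 16 / 4 = 4, so the returned width is 64 / 4 = 16, i.e. the
 * work-group can cover the 64-wide panel in 4 passes of 16 columns each.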
*/ static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth) { cl_ulong nLoops_v, nLoops; // // NOTE: This function should be called only for Non-Transpose cases // NOTE: Does not check if the block size is suitable for our purposes // NOTE: nLoops_v = (theight * theight) / blk_size; nLoops = nLoops_v / vwidth; if (nLoops == 0) { return 0; } return theight/nLoops; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block CLBlasKargs *kargs = (CLBlasKargs *)args; CLBLASKernExtra *extra = (CLBLASKernExtra*) _extra; size_t blocks; size_t vecLenA = extra->vecLenA; #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: calcNrThreads() called \n"); #endif if (((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) || ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans))) { size_t rowsLeft, TARGETROWS; //CL, CU TARGETROWS = subdims->y; rowsLeft = kargs->endRow; blocks = ((rowsLeft-1)/TARGETROWS) + 1; } else { size_t TARGETHEIGHT; if (isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETHEIGHT) == false) { threads[0] =0; threads[1] = 0; #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: calcNrThreads() WARNING: Returning 0\n"); #endif return; } if ( ((kargs->uplo == clblasUpper) && (kargs->order == clblasColumnMajor)) || ((kargs->uplo == clblasLower) && (kargs->order == clblasRowMajor)) ) { blocks = ((kargs->N - kargs->endRow -1) / (BLOCKSIZE / TARGETHEIGHT)) + 1; } else { blocks = (kargs->startRow)/(BLOCKSIZE/TARGETHEIGHT) + 1; } } #ifdef DEBUG_TRSV_GEMV printf("blocks : %lu\n", blocks); #endif threads[0] = blocks * BLOCKSIZE; threads[1] = 1; #ifdef DEBUG_TRSV_GEMV printf("pgran-wgSize[0] : %d, globalthreads[0] : %lu\n", pgran->wgSize[0], threads[0]); #endif return; } static bool isTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t &TARGETHEIGHT) { size_t maxHeight; if (triangle % vecLen) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isTransposeFeasible(): triangle not multiple of vectorLength\n"); #endif return false; } maxHeight = triangle/vecLen; while (blockSize % maxHeight) { maxHeight--; } // maxHeight at minimum will be 1 #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isTransposeFeasible(): Target Height chosen = %lu\n", maxHeight); #endif TARGETHEIGHT = maxHeight; return true; } /* * NOTE: * No-Transpose case - The code iterates along the X direction. Vectoring is along Y Direction. * Since we dont iterate on Y direction (triangle height), this fixes the "blocky" component of the blocksize. * The blockSize then determines how much width the block has on X direction and thus the number of loops * can be calculated from that information. 
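 *
 * Illustrative numbers: triangle = 64, blockSize = 256, vecLen = 4 gives
 * blocky = 64 / 4 = 16, blockx = 256 / 16 = 16 and nLoops = 64 / 16 = 4,
 * so TARGETROWS = 64, TARGETWIDTH = 16 and NLOOPS = 4; every divisibility
 * check in the function passes and the case is reported as feasible.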
*/ static bool isNoTransposeFeasible(size_t triangle, size_t blockSize, size_t vecLen, size_t & TARGETROWS, size_t & TARGETWIDTH, size_t &NLOOPS) { size_t blockx, blocky, nLoops; if ( ((triangle*triangle) % blockSize) != 0) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isNoTransposeFeasible(): triangle*triangle not multiple of blockSize\n"); #endif return false; } if (triangle % vecLen) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isNoTransposeFeasible(): triangle not multiple of vectorLength\n"); #endif return false; } blocky = triangle/vecLen; if (blockSize % blocky) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isNoTransposeFeasible(): blockSize not multiple of blocky\n"); #endif return false; } blockx = blockSize / blocky; if (triangle % blockx) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isNoTransposeFeasible(): blockSize not multiple of blocky\n"); #endif return false; } nLoops = triangle/blockx; TARGETROWS = triangle; TARGETWIDTH = blockx; NLOOPS = nLoops; return true; } // // FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF // FIXME: Return correct return value when "buf" is NON NULL - Needs change in KPRINTF // FIXME: "buflen" check needs to be more accurate. Relies on above changes to KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; unsigned int vecLenA = extraFlags->vecLenA; char tempTemplate[32*1024]; char TARGETROWS_S[10], NLOOPS_S[10], TARGETWIDTH_S[10]; size_t TARGETROWS, NLOOPS, TARGETWIDTH; char TARGETHEIGHT_S[10], BLOCKSIZE_S[10], TRIANGLE_HEIGHT_S[10]; size_t TARGETHEIGHT; bool doVLOAD = false; int BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // [1] will always be 1 since we are a 1D implementation if (buf == NULL) // PENDING: Return correct buffer size { return (32 * 1024 * sizeof(char)); } if (buflen > 32*1024) { #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: generator(): WARNING: Returning 0 as buflen is > 32K\n"); #endif return 0; } if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_TRSV_GEMV printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_TRSV_GEMV printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV GENERATOR called....\n"); #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = (extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0); // unity and doConj handled in setKernelArgs if ( order == clblasRowMajor ) { order = clblasColumnMajor; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper; } // // Check Feasibility and then generate the code. 
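    // Transpose cases substitute the %TARGET_HEIGHT, %BLOCKSIZE and
    // %TRIANGLE_HEIGHT placeholders into the CLT/CUT rectangle templates;
    // non-transpose cases substitute %TARGET_ROWS, %TARGET_WIDTH and %NLOOPS
    // and additionally select the unit-diagonal or non-unity template variant.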
// if ( trans != clblasNoTrans) { if (isTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETHEIGHT) == false) { return 0; } sprintf( TARGETHEIGHT_S, "%" SPREFIX "u", TARGETHEIGHT ); sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE ); sprintf( TRIANGLE_HEIGHT_S, "%" SPREFIX "u", subdims->y ); kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S); kobj.put("%BLOCKSIZE", BLOCKSIZE_S); kobj.put("%TRIANGLE_HEIGHT", TRIANGLE_HEIGHT_S); ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CLT_ComputeRectangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CUT_ComputeRectangle_kernel)); } else // No-Transpose cases... { if (isNoTransposeFeasible(subdims->y, BLOCKSIZE, vecLenA, TARGETROWS, TARGETWIDTH, NLOOPS) == false) { return 0; } sprintf( TARGETROWS_S, "%" SPREFIX "u", TARGETROWS ); sprintf( TARGETWIDTH_S, "%" SPREFIX "u", TARGETWIDTH ); sprintf( NLOOPS_S, "%" SPREFIX "u", NLOOPS ); kobj.put("%TARGET_ROWS", TARGETROWS_S); kobj.put("%TARGET_WIDTH", TARGETWIDTH_S); kobj.put("%NLOOPS", NLOOPS_S); if (unit) { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CL_ComputeRectangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_ComputeRectangle_kernel)); } else { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CL_ComputeRectangle_NonUnity_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_ComputeRectangle_NonUnity_kernel)); } } #ifdef DEBUG_TRSV_GEMV printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. sends offa, offx, and lda should be received as uint #ifdef DEBUG_TRSV_GEMV printf("Vector length used : %d\n\n", vecLenA); #endif kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int inc; cl_int unity, doConj; INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument initSizeKarg(&args[2], blasArgs->N); inc = blasArgs->ldb.vector; INIT_KARG(&args[3], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[4], unity); initSizeKarg(&args[5], blasArgs->lda.matrix); doConj = (blasArgs->transA == clblasConjTrans); #ifdef DEBUG_TRSV_GEMV printf("TRMV GEMV: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc); printf("TRMV GEMV: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow); #endif INIT_KARG(&args[6], doConj); INIT_KARG(&args[7], blasArgs->startRow); INIT_KARG(&args[8], blasArgs->endRow); initSizeKarg(&args[9], blasArgs->offa); initSizeKarg(&args[10], blasArgs->offBX); return; } /* * isFitToLDS() * * 1. We will assume "dim[0].y" as the TRIANGLE_HEIGHT oiow - The number of variables solved * by the corresponding TRTRI kernel * * NOTE: * 1. It is Possible that this function can cause "dim[0].y" to change from what was used in * the "trtri" counterpart. * In such a case, we will detect this in "xtrsv.c" and abort the TRSV call. * 2. We may need to mellow down the bloated numbers we are returning down here. 
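 *
 * Rough illustrative estimate: with dim[0].y = 64, MAXBLOCKSIZE = 256 and
 * the assumed vector width of 4, getTargetWidth() yields tw = 16, so for
 * single-precision data maxSize = (1 + 4 + 16) * 4 + 256 * 4 * 4 = 4180
 * bytes, which fits comfortably into a typical 32 KiB local memory.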
*/ static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs; size_t MAXBLOCKSIZE = 256; cl_ulong maxSize; if ( ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) || ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor)) ) { // // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type? // cl_ulong tw; tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); if (tw == 0) { do { MAXBLOCKSIZE /= 2; tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); } while((MAXBLOCKSIZE > 1) && (tw == 0)); } #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isFitLDS() tw = %lu\n", tw); #endif maxSize = (1+4+tw)*dtypeSize(dtype) + MAXBLOCKSIZE*dtypeSize(dtype)*4; #ifdef DEBUG_TRSV_GEMV printf("TRSV GEMV: isFitLDS() maxSize = %lu, ldsSize = %lu, Y = %lu\n", maxSize, ldsSize, dim[0].y); #endif return (maxSize < ldsSize); } // // The remaining kernels use "TriangleWidth" amount of local memory for storing the RHS. // We will assume "dim[0].y" to be the "TriangleWidth" // MAXBLOCKSIZE = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y; maxSize = (dim[0].y + MAXBLOCKSIZE)*dtypeSize(dtype); return (maxSize < ldsSize); } clblas-2.10/src/library/blas/gens/trsv_trtri.cpp000066400000000000000000000405221264277366700217610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * trsv trtri generator - * * This kernel solves the triangular system of equations with only 1 work-group. * This is terribly slow and forms the weakest link in the chain. * It solves 1 variable per work-item. So, the size of the triangle that can be solved * is limited by the hardware's MAX_WORKGROUP_SIZE. * The "chain" for solving larger systems of equations involve a "gemv" operation * which can be exploited by "xtrsv.c". However, the current "gemv" implementation * does NOT support "single complex" and "double complex" data types. * So, to give complete support, another "trsv_gemv" generator will be used. 
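 *
 * A rough sketch of the one-unknown-per-work-item idea (illustrative only,
 * not the generated kernel; it assumes a unit-diagonal, column-major lower
 * triangle, with each work-item 'lid' holding its right-hand side in 'rhs'):
 *
 *     for (uint j = 0; j < N; j++) {
 *         if (lid == j)
 *             xShared[j] = rhs;               // x[j] is now final
 *         barrier(CLK_LOCAL_MEM_FENCE);
 *         if (lid > j)
 *             rhs -= A[lid + j * lda] * xShared[j];
 *     }
 *
 * Each round publishes one solved unknown through local memory, which is
 * why the triangle size is bounded by the maximum work-group size.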
*/ #include #include #include #include #include #include #include #include #include #include //#include "blas_kgen.h" #include //#define DEBUG_TRSV_TRTRI extern "C" unsigned int dtypeSize(DataType type); static char Prefix[4]; // PENDING: Magic "4" == Number of data types supported (float, double, cl_float2, cl_double2) static SolverFlags solverFlags(void) { #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI solverFlags(): solverFlags callen......\n"); #endif return (SF_WSPACE_1D); } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *extra); static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static void assignKargs(KernelArg *args, const void *params, const void*); extern "C" void initTrsvDefaultPattern(MemoryPattern *mempat); static void setBuildOpts( char * buildOptStr, const void *kArgs); static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs); static ssize_t generator_tbsv( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra); static SolverOps trsvOps = { generator, assignKargs, isFitToLDS, NULL, // Prepare Translate Dims NULL, // Inner Decomposition Axis calcNrThreads, NULL, // Image related solverFlags, NULL, NULL, NULL, setBuildOpts, NULL }; static void setBuildOpts( char * buildOptStr, const void *args) { const SolutionStep *step = (const SolutionStep *)args; const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args); if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION"); #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n"); #endif } if( kargs->pigFuncID == CLBLAS_TPSV) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED"); #ifdef DEBUG_TRSV_TRTRI printf("TPSV TRTRI: Setting build options ... PACKED\n"); #endif } if( kargs->pigFuncID == CLBLAS_TBSV) { addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED"); #ifdef DEBUG_TRSV_TRTRI printf("TBSV TRTRI: Setting build options .. 
BANDED\n"); #endif } return; } static CLBLASMpatExtra mpatExtra; extern "C" void initTrsvDefaultPattern(MemoryPattern *mempat) { #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI: initTRSVDefaultPattern called with mempat = 0x%p\n", (void*)mempat); #endif mempat->name = "Triangular matrix solver - Only 1 workgroup"; mempat->nrLevels = 2; mempat->cuLevel = 0; mempat->thLevel = 1; mempat->sops = &trsvOps; mpatExtra.aMset = CLMEM_LEVEL_L2; mpatExtra.bMset = CLMEM_LEVEL_L1 | CLMEM_LEVEL_LDS; mpatExtra.mobjA = CLMEM_BUFFER; // == No images mpatExtra.mobjB = CLMEM_BUFFER; // == No images mempat->extra = &mpatExtra; Prefix[TYPE_FLOAT] = 'S'; Prefix[TYPE_DOUBLE] = 'D'; Prefix[TYPE_COMPLEX_FLOAT] = 'C'; Prefix[TYPE_COMPLEX_DOUBLE] = 'Z'; } // // Read comments atop "isFitToLDS()" // This function is required by "isFitLDS()" // static cl_ulong getTargetWidth(size_t theight, size_t blk_size, size_t vwidth) { cl_ulong nLoops_v, nLoops; // // NOTE: This function should be called only for Non-Transpose cases // NOTE: Does not check if the block size is suitable for our purposes // NOTE: nLoops_v = (theight * theight) / blk_size; nLoops = nLoops_v / vwidth; if (nLoops == 0) { return 0; } return theight/nLoops; } static void calcNrThreads( size_t threads[2], const SubproblemDim *subdims, const PGranularity *pgran, const void *args, const void *_extra) { size_t BLOCKSIZE = pgran->wgSize[0] * pgran->wgSize[1]; // 1D Block CLBlasKargs *kargs = (CLBlasKargs *)args; #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI: calcNrThreads() called \n"); #endif int blocks = 1; _extra = _extra; // Dummy- to avoid warnings #ifdef DEBUG_TRSV_TRTRI printf("blocks : %d\n", blocks); #endif if (((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) || ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans))) { if (subdims->y > BLOCKSIZE) { // These little kernels cannot handle arbitrary numbers printf("TRSV calcNrThreads(): Warning. TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y); threads[0] = 0; threads[1] = 0; return; } } else { if (subdims->y > 1024) { // These little kernels cannot handle arbitrary numbers printf("TRSV calcNrThreads(): Warning. 
TRTRI Cannot handle subproblemdim of size %lu\n", subdims->y); threads[0] = 0; threads[1] = 0; return; } } threads[0] = blocks * BLOCKSIZE; threads[1] = 1; #ifdef DEBUG_TRSV_TRTRI printf("pgran-wgSize[0] : %d, globalthreads[0] : %lu\n", pgran->wgSize[0], threads[0]); #endif return; } // // FIXME: Report correct return value when "buf" is NULL - Needs change in KPRINTF // FIXME: Return correct return value - Needs change in KPRINTF // static ssize_t generator( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tempTemplate[32*1024]; char vector_size_trans[10], triangle_height[10]; pgran = pgran; // Dummy- to avoid warnings if (buf == NULL) // PENDING: Return correct buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; SolutionStep *step = container_of( pgran , pgran, SolutionStep); // NOTE: using container_of() to get pigFuncID CLBlasKargs* kargs = (CLBlasKargs*) &(step->args); if(kargs->pigFuncID == CLBLAS_TBSV) { return generator_tbsv(buf, buflen, subdims, pgran, extra); } #ifdef DEBUG_TRSV_TRTRI printf("TRSV GENERATOR called....\n"); if((( extraFlags->flags & KEXTRA_TRANS_A) || ( extraFlags ->flags & KEXTRA_CONJUGATE_A ))) { printf("A is trans or CONJ-TRANS\n"); } else { printf("A is noTrans...\n"); } #endif clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); //bool unit = (((extraFlags->flags) & KEXTRA_UNIT_DIAGONAL) != 0); // unity and doConj handled in setKernelArgs if ( order == clblasRowMajor ) { order = clblasColumnMajor; if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper; } if ( trans == clblasNoTrans) { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel)); } else // Transpose cases... { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel)); } #ifdef DEBUG_TRSV_TRTRI printf("dataType : %c\n", Prefix[extraFlags->dtype]); #endif // FIXME: VECTORSIZE HARD CODED // FIXME : SetKernelArgs.. 
sends offa, offx, and lda should be received as uint unsigned int vecLenA = extraFlags->vecLenA; #ifdef DEBUG_TRSV_TRTRI printf("Vector length used : %d\n\n", vecLenA); #endif bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_TRSV_TRTRI printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_TRSV_TRTRI printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); if (trans != clblasNoTrans) { sprintf( vector_size_trans, "%u", vecLenA ); sprintf( triangle_height, "%ld", subdims[0].y ); #ifdef DEBUG_TRSV_TRTRI printf("vector size trans = %s\n", vector_size_trans); #endif kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans); kobj.put("%TRIANGLE_HEIGHT", triangle_height); } kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } static void assignKargs(KernelArg *args, const void *params, const void*) { CLBlasKargs *blasArgs = (CLBlasKargs*)params; cl_int inc; cl_int unity, doConj; INIT_KARG(&args[0], blasArgs->A); //A - input matrix - argument INIT_KARG(&args[1], blasArgs->B); //x - result buffer = _xnew argument initSizeKarg(&args[2], blasArgs->N); inc = blasArgs->ldb.vector; INIT_KARG(&args[3], inc); unity = (blasArgs->diag == clblasUnit); INIT_KARG(&args[4], unity); initSizeKarg(&args[5], blasArgs->lda.matrix); doConj = (blasArgs->transA == clblasConjTrans); #ifdef DEBUG_TRSV_TRTRI printf("TRMV TRTRI: assignKargs: doConj is : %d, unity is : %d, incx is : %d\n", doConj, unity, inc); printf("TRMV TRTRI: startRow, startCol set to %d, %d\n", blasArgs->startRow, blasArgs->endRow); #endif INIT_KARG(&args[6], doConj); INIT_KARG(&args[7], blasArgs->startRow); INIT_KARG(&args[8], blasArgs->endRow); initSizeKarg(&args[9], blasArgs->offa); initSizeKarg(&args[10], blasArgs->offBX); if( blasArgs->pigFuncID == CLBLAS_TBSV) { initSizeKarg(&args[11], blasArgs->K); } return; } /* * isFitToLDS() is based on the "trsv_gemv" counterpart than the kernel corresponding to TRTRI * The Kernels corersponding to TRTRI are run with only 1 Workgroup. * So, it really does not matter at all. * But, if dim[0].y selected by the library changes between TRTRI and TRSV_GEMV, results will go * wrong. So, by using the same "isFitToLDS" function, we will indirectly force the library to * choose the same "SubproblemDim" for both cases. */ static bool isFitToLDS( SubproblemDim *dim, DataType dtype, cl_ulong ldsSize, const void *kernelArgs) { CLBlasKargs *blasArgs = (CLBlasKargs *)kernelArgs; size_t MAXBLOCKSIZE = 256; cl_ulong maxSize; if ( ((blasArgs->transA == clblasNoTrans) && (blasArgs->order == clblasColumnMajor)) || ((blasArgs->transA != clblasNoTrans) && (blasArgs->order == clblasRowMajor)) ) { // // Estimate worst case Local Memory needed - Vector Width of 4 irrespective of data-type? // cl_ulong tw; tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); if (tw == 0) { do { MAXBLOCKSIZE /= 2; tw = getTargetWidth(dim[0].y, MAXBLOCKSIZE, 4); } while((MAXBLOCKSIZE > 1) && (tw == 0)); } #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI: isFitLDS() tw = %lu\n", tw); #endif maxSize = (1+4+tw)*dtypeSize(dtype) + MAXBLOCKSIZE*dtypeSize(dtype)*4; #ifdef DEBUG_TRSV_TRTRI printf("TRSV TRTRI: isFitLDS() maxSize = %lu, ldsSize = %lu, Y=%lu\n", maxSize, ldsSize, dim[0].y); #endif return (maxSize < ldsSize); } // // The remaining kernels use "TriangleWidth" amount of local memory for storing the RHS. 
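    // (Illustrative estimate for the branch below: with dim[0].y = 64 and
    // double-precision data, MAXBLOCKSIZE caps at 256 and
    // maxSize = (64 + 256) * 8 = 2560 bytes.)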
// We will assume "dim[0].y" to be the "TriangleWidth" // MAXBLOCKSIZE = (dim[0].y)*(dim[0].y) > 256 ? 256 : dim[0].y*dim[0].y; maxSize = (dim[0].y + MAXBLOCKSIZE)*dtypeSize(dtype); return (maxSize < ldsSize); } static ssize_t generator_tbsv( char *buf, size_t buflen, const struct SubproblemDim *subdims, const struct PGranularity *pgran, void *extra) { char tempTemplate[32*1024]; char vector_size_trans[10], triangle_height[10]; pgran = pgran; // Dummy- to avoid warnings if (buf == NULL) // PENDING: Return correct buffer size { buflen = (32 * 1024 * sizeof(char)); return (ssize_t)buflen; } CLBLASKernExtra *extraFlags = ( CLBLASKernExtra *)extra; clblasUplo uplo = ( extraFlags->flags & KEXTRA_UPPER_TRIANG) ? clblasUpper : clblasLower; clblasOrder order = ( extraFlags->flags & KEXTRA_COLUMN_MAJOR) ? clblasColumnMajor: clblasRowMajor; clblasTranspose trans = ( extraFlags->flags & KEXTRA_TRANS_A) ? clblasTrans : (( extraFlags->flags & KEXTRA_CONJUGATE_A) ? clblasConjTrans: clblasNoTrans); // unity and doConj handled in setKernelArgs if ( order == clblasColumnMajor ) { if ( trans == clblasNoTrans) { trans = clblasTrans; } else if ( trans == clblasTrans ) { trans = clblasNoTrans; } else // clblasConjTrans { trans = clblasNoTrans; } uplo = ( uplo == clblasUpper)? clblasLower : clblasUpper; } if ( trans == clblasNoTrans) { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CL_SolveTriangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CU_SolveTriangle_kernel)); } else // Transpose cases... { ( uplo == clblasLower )? (strcpy(tempTemplate, (char*)trsv_CLT_SolveTriangle_kernel)) : (strcpy(tempTemplate, (char*)trsv_CUT_SolveTriangle_kernel)); } unsigned int vecLenA = extraFlags->vecLenA; bool doVLOAD = false; if( extraFlags->flags & KEXTRA_NO_COPY_VEC_A ) { doVLOAD = true; #ifdef DEBUG_TRSV_TRTRI printf("DOing VLOAD as Aligned Data Pointer not Availabe\n"); #endif } else { #ifdef DEBUG_TRSV_TRTRI printf("Using Aligned Data Pointer .........................\n"); #endif } kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD); if (trans != clblasNoTrans) { sprintf( vector_size_trans, "%u", vecLenA ); sprintf( triangle_height, "%ld", subdims[0].y ); kobj.put("%PREFIXVECTOR_SIZE_TRANS", (const char *)vector_size_trans); kobj.put("%TRIANGLE_HEIGHT", triangle_height); } kobj.spit((char*)buf, tempTemplate); return (32 * 1024 * sizeof(char)); } clblas-2.10/src/library/blas/gens/trxm_common.c000066400000000000000000000203461264277366700215430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "trxm_common.h" void declareTrxmKernel( struct KgenContext *ctx, DataType dtype, const PGranularity *pgran, KernelExtraFlags kflags, BlasFunctionID funcID, const char *nameSuffix, bool declareC, bool restrictPointers) { char tmp[1024]; char strC[1024]; char fpref, fsuff; const char *typeName; // swap coordinate names for the right side char coordNames[2] = {'M', 'N'}; int side = ((kflags & KEXTRA_SIDE_RIGHT) != 0); char offStr[1024]; int len = 0; const char *qualA[2], *qualB[2]; // type qualifiers typeName = dtypeBuiltinType(dtype); fpref = dtypeToBlasPrefix(dtype); fsuff = (funcID == CLBLAS_TRMM) ? 'm' : 's'; if (nameSuffix == NULL) { nameSuffix = ""; } strC[0] = '\0'; if (declareC) { sprintf(strC, " __global %s *C,\n", typeName); } offStr[0] = '\0'; if (kflags & KEXTRA_STARTM_NOT_ZERO) { len = sprintf(offStr, ",\n uint offset%c", coordNames[side]); } if (kflags & KEXTRA_STARTN_NOT_ZERO) { len += sprintf(offStr + len, ",\n uint offset%c", coordNames[1 - side]); } if (kflags & KEXTRA_A_OFF_NOT_ZERO) { strcat(offStr, ",\n uint offA"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { strcat(offStr, ",\n uint offB"); } if (restrictPointers) { qualA[0] = "const "; qualA[1] = "restrict "; } else { qualA[0] = qualA[1] = ""; } if (restrictPointers && declareC) { qualB[0] = "const "; qualB[1] = "restrict "; } else { qualB[0] = qualB[1] = ""; } sprintf(tmp, "__attribute__((reqd_work_group_size(%u, 1, 1)))\n" "void __kernel\n" "%ctr%cm%s(\n" " uint %c,\n" " uint %c,\n" " %s alpha,\n" " %s__global %s *%sA,\n" " uint lda,\n" " %s__global %s *%sB,\n" "%s" " uint ldb%s)\n", pgran->wgSize[0], fpref, fsuff, nameSuffix, coordNames[side], coordNames[1 - side], typeName, qualA[0], typeName, qualA[1], qualB[0], typeName, qualB[1], strC, offStr); kgenDeclareFunction(ctx, tmp); } void genTrxmBMatrShift( struct KgenContext *ctx, KernelExtraFlags kflags, bool useC) { char tmp[1024], addstr[1024]; int len = 0; const char *opstr; char coordNames[2] = {'M', 'N'}; int side = (int)((kflags & KEXTRA_SIDE_RIGHT) != 0); bool cmaj = ((kflags & KEXTRA_COLUMN_MAJOR) != 0); if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { len = sprintf(addstr, "offB"); } if (kflags & KEXTRA_STARTM_NOT_ZERO) { opstr = (len) ? " + " : ""; if (cmaj) { len += sprintf(addstr + len, "%soffset%c", opstr, coordNames[side]); } else { len += sprintf(addstr + len, "%soffset%c * ldb", opstr, coordNames[side]); } } if (kflags & KEXTRA_STARTN_NOT_ZERO) { opstr = (len) ? " + " : ""; if (cmaj) { len += sprintf(addstr + len, "%soffset%c * ldb", opstr, coordNames[1 - side]); } else { len += sprintf(addstr + len, "%soffset%c", opstr, coordNames[1 - side]); } } if (len) { sprintf(tmp, "B += %s;\n", addstr); kgenAddStmt(ctx, tmp); if (useC) { sprintf(tmp, "C += %s;\n", addstr); kgenAddStmt(ctx, tmp); } kgenAddBlankLine(ctx); } } void fixupTrxmKargs(CLBlasKargs *kargs) { size_t offA = (kargs->side == clblasRight) ? 
kargs->offsetN : kargs->offsetM; kargs->offA += offA * kargs->lda.matrix + offA; if (kargs->order == clblasColumnMajor) { kargs->offBX += kargs->offsetN * kargs->ldb.matrix + kargs->offsetM; } else { kargs->offBX += kargs->offsetM * kargs->ldb.matrix + kargs->offsetN; } kargs->offsetM = kargs->offsetN = 0; } /* avoid " + 0" statements */ static void genAdd(char *buf, size_t val) { if (val == 0) { buf[0] = 0; //zero length string } else { sprintf(buf, " + %lu", val); } } int genTrxmPostFetchZero( struct KgenContext *ctx, MatrixRole mrole, void *priv) { TilePostFetchPrivate *pfPriv = (TilePostFetchPrivate*)priv; char tmp[1024]; char stmtStr[512]; const CLBLASKernExtra *kextra = pfPriv->gset->kextra; KernelExtraFlags kflags = kextra->flags; const KernelVarNames *vnames = &pfPriv->gset->varNames; char yCoordVar[64], xCoordVar[64]; size_t blockx, blocky; unsigned int x, y; const struct SubproblemDim *dims = &pfPriv->gset->subdims[1]; DataType dtype = pfPriv->gset->kextra->dtype; bool b; bool tra; Kstring kstr; const Tile* pTile = &pfPriv->gset->tileA; // For both A and B tiles, zero tail along K b = ((pfPriv->gset->flags & BGF_DISTINCT_VECLEN)); if (checkForTailFetches(pfPriv->funcID, dims, kextra, mrole, b, true) != FETCH_NO_TAILS) { defaultTilePostFetch(ctx, mrole, &pfPriv[1]); } if (mrole == MATRIX_B) { /* This is not triangular matrix, just go away from here */ return 0; } blockx = blocky = 0; // zero triangular part of tile a // either single row of tile a either the whole tile have been fetched tra = isMatrixAccessColMaj(pfPriv->funcID, kflags, mrole); if (tra) { blocky = pfPriv->wholeA ? dims->bwidth : 1; blockx = dims->y; sprintf(xCoordVar, "%s", vnames->coordA); sprintf(yCoordVar, "%s", vnames->k); } else { blocky = pfPriv->wholeA ? dims->y : 1; blockx = dims->bwidth; sprintf(xCoordVar, "%s", vnames->k); sprintf(yCoordVar, "%s", vnames->coordA); } kgenAddStmt(ctx, "// post fetch A\n"); kgenBeginBranch(ctx, NULL); genAdd(stmtStr, (size_t)pfPriv->fetchNumA); sprintf(tmp, "uint zy = %s%s;\n", yCoordVar, stmtStr); kgenAddStmt(ctx, tmp); // loop through block rows (there is only one row in A block) for(y = 0; y < blocky; y++) { // loop through all elements of block row for(x = 0; x < blockx; x++) { unsigned int row, col; char cmp = '<'; row = (unsigned int)(tra ? x : y); col = (unsigned int)(tra ? y : x); if (((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_COLUMN_MAJOR) != 0)) { cmp = '>'; } genAdd(stmtStr, x); sprintfTileElement(&kstr, pTile, row, col, 1); sprintf(tmp, "%s = zy %c %s%s ? 0 : %s;\n", kstr.buf, cmp, xCoordVar, stmtStr, kstr.buf); kgenAddStmt(ctx, tmp); if (kflags & KEXTRA_UNIT_DIAGONAL) { const char *one = strOne(dtype); sprintf(tmp, "%s = zy == %s%s ? " "%s : %s;\n", kstr.buf, xCoordVar, stmtStr, one, kstr.buf); kgenAddStmt(ctx, tmp); } } if (y != blocky - 1) { kgenAddStmt(ctx, "zy++;\n"); } pfPriv->fetchNumA++; } return kgenEndBranch(ctx, NULL); } clblas-2.10/src/library/blas/gens/trxm_common.h000066400000000000000000000076571264277366700215620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TRXM_BUFS_COMMON_H_ #define TRXM_BUFS_COMMON_H_ #include "blas_kgen.h" #include "gen_helper.h" #include "blas_funcs.h" /* * COMMON NOTES: * To use the functions the caller must guarantee kernel argument * naming and subproblem dimensions independent on the side. * That means size of A must be named as 'M'. The 'y' field of dimensions * must be a step over rows of the matrix A in case of the left side, and over * columns of the matrix otherwise. Similarly the 'x' field must be a step * over columns of the matrix B in case of the left side, and over rows of * the matrix otherwise. Both 'A' and 'B' are passed in global buffers. */ void declareTrxmKernel( struct KgenContext *ctx, DataType dtype, const PGranularity *pgran, KernelExtraFlags kflags, BlasFunctionID funcID, const char *nameSuffix, bool declareC, bool restrictPointers); /* * Declare local variables for LDS based version * of TRXM kernels. * * It provides the names typical for another generators as well: * * lid, gid - local and global ID. * m0, k0 - top level counters over M and N * currM, currN - current block coordinates over M and N at the top level * tempA, tempB - blocks of matrix A and B located in the local memory * tempC - block of matrix C located in the local memory; declared if * the 'useLocalC' argument is set * c - matrix C tile located in registers; declared if the 'useLocalC' * argument is not set * x, y - auxiliary variables to evaluate size of read/write blocks * * TRXM specific variables: * * startM, endM - starting and end coordinate over rows a kernel can access */ void declareLdsBasedTrxmVariables( struct KgenContext *ctx, DataType dtype, const SubproblemDim *dims, const PGranularity *pgran, bool useLocalC); /* * NOTE: the all following functions generate a code * using local variables declared with the * 'declareTrxmLocalVariables' function */ void genPrepareTrxmBlockA( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags, const char *nameM); void genPrepareTrxmBlockB( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, const CopyBufFuncs *copyFuncs, const ZeroFuncs *zeroFuncs, KernelExtraFlags flags); void genUpdateTrxmResult( struct KgenContext *ctx, const SubproblemDim *dims, char *fnName, char *genericFnName, KernelExtraFlags kflags); /* * Triangulate matrix block. The decision to triangulate is * made based on the current coordinates. */ void genTriangMatrBlock( struct KgenContext *ctx, const SubproblemDim *dim, DataType dtype, KernelExtraFlags kflags); /* * Move matrix B start pointer according to offsetM, offsetN. 
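 * For example, in the left-side, column-major case the emitted statement has
 * the form "B += offB + offsetM + offsetN * ldb;", with each term generated
 * only when the corresponding offset is known to be non-zero.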
*/ void genTrxmBMatrShift( struct KgenContext *ctx, KernelExtraFlags kflags, bool useC); void fixupTrxmKargs(CLBlasKargs *kargs); /* Setting to zero upper/lower triangle elements and optionally set diagonal * elements to one after fetching */ int genTrxmPostFetchZero( struct KgenContext *ctx, MatrixRole mrole, void *priv); #endif /* TRXM_BUFS_COMMON_H_ */ clblas-2.10/src/library/blas/gens/tuned_numbers.c000066400000000000000000000621611264277366700220540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "tuned_numbers.h" #define USE_TUNED_NUMBERS typedef enum callType { GEMM_NN_CALL, // A-Non trans, B-Non trans GEMM_NT_CALL, // A-Non trans, B-Trans GEMM_TN_CALL, // A-Trans, B-Non trans GEMM_TT_CALL, // A-Trans, B-Trans HERK_UN_CALL, // Upper, Non-trans HERK_UC_CALL, // Upper, Conj-trans HERK_LN_CALL, // Lower, Non-trans HERK_LC_CALL, // Lower, Conj-trans SYMM_LU_CALL, // Left, Upper SYMM_RU_CALL, // Right, Upper SYMM_LL_CALL, // Left, Lower SYMM_RL_CALL, // Right, Lower HEMM_LU_CALL, // Left, Upper HEMM_RU_CALL, // Right, Upper HEMM_LL_CALL, // Left, Lower HEMM_RL_CALL, // Right, Lower NUM_CALL_TYPES } callType; blockSizes bestBlockSizeForDevice( SolutionStep *step ) { blockSizes temp; callType currCall; CLBlasKargs *kargs = &(step->args); TargetDevice *kDevice = &(step->device); size_t maxWGSize; /////////////////////////////////////////////////////////////////////////////////////////////////////////// // QUICK FIX: changing code using fast regex search-replace: // Removing the tagged array-of-structs initialization - which works only with gcc // moving the global static variable locally and assiging the values as individual statements // this is not thread-safe; fix-this if thread safety is needed static blockSizes bestBlockSizes [NUM_DEVICE_CHIPS][4][NUM_CALL_TYPES]; // [NUM_DEVICE_CHIPS][NUM_DATATYPES][NUM_CALL_TYPES] // Block sizes for unknows devices -- using default numbers { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][GEMM_TT_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 16, 2, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][GEMM_TT_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; } { 
blockSizes t = { 8, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 16, 2, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][GEMM_TT_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 16, 1, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][GEMM_TT_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; } { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 8, 8, 4, 1 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; 
bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CHIP_UNKNOWN][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; } #ifdef USE_TUNED_NUMBERS // Block sizes for Cayman { blockSizes t = { 32, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 8, 1 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 8, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 1 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 4, 16, 8, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 32, 4, 4, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 8, 8, 0 }; bestBlockSizes[CAYMAN][TYPE_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; 
bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CAYMAN][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; } // Block sizes for Tahiti { blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 32, 8, 8, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 1 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 16, 4, 2, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 2, 1 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; } { blockSizes t = { 4, 16, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; } { blockSizes t = { 8, 32, 8, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; } { blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 32, 8, 4, 8, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 32, 8, 8, 4, 0 }; bestBlockSizes[TAHITI][TYPE_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 
16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; } { blockSizes t = { 32, 8, 4, 4, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; } { blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; } { blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; } { blockSizes t = { 16, 16, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; } { blockSizes t = { 8, 32, 4, 2, 0 }; bestBlockSizes[TAHITI][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; } // Block-sizes for Cypress { blockSizes t = { 32, 8, 4, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 1 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 16, 8, 4, 1 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 4, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 8, 32, 4, 4, 1 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 8, 32, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; } { blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 8, 8, 8, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 8, 16, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 8, 8, 8, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 8, 4, 4, 0 }; 
bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 8, 8, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 8, 8, 4, 8, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; } { blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; } { blockSizes t = { 4, 32, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; } { blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_LU_CALL] = t; } { blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_RU_CALL] = t; } { blockSizes t = { 4, 16, 4, 4, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_LL_CALL] = t; } { blockSizes t = { 32, 4, 8, 2, 0 }; bestBlockSizes[CYPRESS][TYPE_COMPLEX_DOUBLE][HEMM_RL_CALL] = t; } // Block-sizes for GeForce GTX 580 { blockSizes t = { 16, 32, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 32, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 32, 16, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 16, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 32, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_NN_CALL] = t; } { blockSizes t = { 32, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_NT_CALL] = t; } { blockSizes t = { 32, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][GEMM_TN_CALL] = t; } { blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][GEMM_NN_CALL] = t; } { blockSizes t = { 32, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][GEMM_NT_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_UN_CALL] = t; } { blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_UC_CALL] = t; } { blockSizes t = { 16, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_LN_CALL] = t; } { blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HERK_LC_CALL] = t; } { blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_UN_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_UC_CALL] = t; } { blockSizes t = { 16, 32, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_LN_CALL] = t; } { blockSizes t = { 8, 16, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][HERK_LC_CALL] = t; } { blockSizes t = { 32, 16, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; 
bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 8, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 8, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 8, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_DOUBLE][SYMM_RL_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_LU_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_RU_CALL] = t; } { blockSizes t = { 16, 8, 4, 2, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_LL_CALL] = t; } { blockSizes t = { 16, 4, 4, 4, 0 }; bestBlockSizes[GEFORCE_GTX_580][TYPE_COMPLEX_FLOAT][HEMM_RL_CALL] = t; } #endif // USE_TUNED_NUMBERS /////////////////////////////////////////////////////////////////////////////////////////////////////////// identifyDevice( kDevice ); // Query device name and stores it in the structure if( kargs->pigFuncID == CLBLAS_GEMM2 ) { if( kargs->transA == clblasNoTrans ) { if( kargs->transB == clblasNoTrans ) currCall = GEMM_NN_CALL; else currCall = GEMM_NT_CALL; } else { if( kargs->transB == clblasNoTrans ) currCall = GEMM_TN_CALL; else currCall = GEMM_TT_CALL; } } else if( kargs->pigFuncID == CLBLAS_HERK ) { if( kargs->uplo == clblasUpper ) { if( kargs->transA == clblasNoTrans ) currCall = HERK_UN_CALL; else currCall = HERK_UC_CALL; } else { if( kargs->transA == clblasNoTrans ) currCall = HERK_LN_CALL; else currCall = HERK_LC_CALL; } } else if( (kargs->pigFuncID == CLBLAS_SYMM) || (kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) ) { if( kargs->side == clblasLeft ) { if( kargs->uplo == clblasUpper ) currCall = SYMM_LU_CALL; else currCall = SYMM_LL_CALL; } else { if( kargs->uplo == clblasUpper ) currCall = SYMM_RU_CALL; else currCall = SYMM_RL_CALL; } } else if( (kargs->pigFuncID == CLBLAS_HEMM) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL) ) { if( kargs->side == clblasLeft ) { if( kargs->uplo == clblasUpper ) currCall = HEMM_LU_CALL; else currCall = HEMM_LL_CALL; } else { if( kargs->uplo == clblasUpper ) currCall = HEMM_RU_CALL; else currCall = HEMM_RL_CALL; } } temp = bestBlockSizes [ (kDevice->ident).chip ] [kargs->dtype] [currCall]; if( (temp.TY == 0) || (temp.TX == 0) || (temp.ITEMY == 0) || (temp.ITEMX == 0) ) { // If optimal block-sizes for the device is not available, // we take default block-sizes 
        temp = bestBlockSizes [CHIP_UNKNOWN] [kargs->dtype] [currCall];
    }

    maxWGSize = deviceMaxWorkgroupSize( (kDevice->id), NULL );
    while( ( ((size_t)temp.TY)*((size_t)temp.TX) ) > maxWGSize ) // FIXME check this
    {
        if( temp.TX < temp.TY )
            temp.TX /= 2;
        else
            temp.TY /= 2;
    }

    return temp;
}
clblas-2.10/src/library/blas/gens/tuned_numbers.h000066400000000000000000000024331264277366700220550ustar00rootroot00000000000000
/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#ifndef _TUNED_NUMBERS_
#define _TUNED_NUMBERS_

#include #include #include #include

#ifdef __cplusplus
extern "C" {
#endif

typedef struct blockSizes
{
    unsigned char TY;           // Not more than 32
    unsigned char TX;
    unsigned char ITEMY:7;      // Not more than 8
    unsigned char ITEMX:7;
    unsigned char useBarrier:1;
} blockSizes;

blockSizes bestBlockSizeForDevice( SolutionStep *step );

#ifdef __cplusplus
}       /* extern "C" { */
#endif

#endif // _TUNED_NUMBERS_
clblas-2.10/src/library/blas/gens/xxmv_common.c000066400000000000000000000242311264277366700215500ustar00rootroot00000000000000
/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
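// ---------------------------------------------------------------------------
// Editorial example (not part of clBLAS): the loop at the end of
// bestBlockSizeForDevice() above shrinks a tuned work-group shape until
// TY*TX fits the device limit. The standalone sketch below mirrors that loop
// exactly (including the branch flagged "FIXME check this", which halves TX
// when TX < TY); the struct name and the sample numbers are made up.
// ---------------------------------------------------------------------------
//
// #include <stddef.h>
//
// typedef struct { unsigned char TY, TX; } exampleShape;
//
// static exampleShape clampToWorkgroupLimit(exampleShape s, size_t maxWGSize)
// {
//     while ((size_t)s.TY * (size_t)s.TX > maxWGSize) {
//         if (s.TX < s.TY)
//             s.TX /= 2;      // same (questionable) choice as the loop above
//         else
//             s.TY /= 2;
//     }
//     return s;
// }
//
// For example, a { TY = 16, TX = 16 } shape clamped to maxWGSize == 64
// becomes { TY = 4, TX = 16 }: 16x16 -> 8x16 -> 4x16.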
* ************************************************************************/ #include #include #include "xxmv_common.h" static void genMul(char *buf, size_t val, const char* type, const char* sum, const char* mul) { if (mul == NULL) { if (sum == NULL) { sprintf(buf, "%lu", val); } else { if (val == 0) { sprintf(buf, "%s", sum); //zero length string } else { sprintf(buf, "%s + %lu", sum, val); } } } else { if (sum == NULL) { if (val == 0) { sprintf(buf, "0"); //zero length string } else if (val == 1) { sprintf(buf, "%s", mul); //zero length string } else { sprintf(buf, "mad24((%s)%lu, (%s)%s, (%s)0)", type, val, type, mul, type); //sprintf(buf, "%lu * %s", val, mul); } } else { if (val == 0) { sprintf(buf, "mad24((%s)%s, (%s)%s, (%s)0)", type, sum, type, mul, type); //zero length string //sprintf(buf, "%s * %s", sum, mul); } else { sprintf(buf, "mad24((%s)%s + %lu, (%s)%s, (%s)0)", type, sum, val, type, mul, type); //sprintf(buf, "(%s + %lu) * %s", sum, val, mul); } } } } void genFetchX( struct KgenContext *ctx, Tile *tile, unsigned int vecLen, DataType dtype, const KernelVarNames *varNames, TileMulFlags tflags, KernelExtraFlags kflags) { Kstring kstr[1]; Tile memtile; char tmp[1024], strMul[128]; unsigned int n; const char *ptrName; bool tailN = (tflags & TILEMUL_SKEW_B) != 0; bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0); bool elemFetch = ((kflags & KEXTRA_NO_COPY_VEC_B) != 0); unsigned int nfetch = !tailN && incxOne && !elemFetch ? vecLen : 1; (void)dtype; initTile(&memtile, NULL, tile->nrRows, tile->nrCols, nfetch, tile->dtype, tile->storType, tile->trans, tile->packed); getVectorTypeName(tile->dtype, vecLen, NULL, &ptrName); if (!tailN && incxOne && !elemFetch) { sprintf(tmp, "const uint xk = %s / %u;\n", varNames->k, vecLen); kgenAddStmt(ctx, tmp); for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) { sprintf(tmp,"%s = %s.%s[xk + %u];\n", kstr[0].buf, varNames->B, ptrName, n); kgenAddStmt(ctx, tmp); } } else { for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) { genMul(strMul, n, "int", "k", incxOne ? NULL : "incx"); if (tailN) { sprintf(tmp,"%s = X[k + %u < %s ? %s : 0];\n", kstr[0].buf, n, varNames->sizeK, strMul); } else { sprintf(tmp,"%s = X[%s];\n",kstr[0].buf, strMul); } kgenAddStmt(ctx, tmp); } } if (tailN) { for (n = 0; forEachTile(kstr, n, 0, 2, tile, &memtile); n++) { sprintf(tmp,"%s = k + %u < %s ? %s : 0;\n", kstr[0].buf, n, varNames->sizeK, kstr[0].buf); kgenAddStmt(ctx, tmp); } } } void setResultPos( struct KgenContext *ctx, KernelExtraFlags kflags, const char *axVar) { bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0); char tmp[2048]; if (incyOne) { sprintf(tmp, "Y += %s;\n", axVar); } else { sprintf(tmp, "Y += incy * (int)%s;\n", axVar); } kgenAddStmt(ctx, tmp); } void updateResultVectorTiled( struct KgenContext *ctx, KernelExtraFlags kflags, unsigned int vecLen, Tile *tile) { bool beta0 = ((kflags & KEXTRA_BETA_ZERO) != 0); bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0); bool tailM = ((kflags & KEXTRA_TAILS_M) != 0); bool isComplex = isComplexType(tile->dtype); unsigned int n, i; const char *outTypeName, *outPtrName; Tile result, memtile; char tmp[2048],strMul[256]; Kstring kstr[2]; if (isComplex) { vecLen = 1; } initTile(&result, "r", tile->nrRows, tile->nrCols, tile->nrRows, tile->dtype, tile->storType, true, tile->packed); declareOneTileStorage(ctx, &result); memtile = result; memtile.baseName = NULL; memtile.vecLen = !tailM && incyOne ? 
vecLen : 1; getVectorTypeName(memtile.dtype, memtile.vecLen, &outTypeName, &outPtrName); sprintf(tmp,"GPtr uC;\n" "uC.f = Y;\n"); kgenAddStmt(ctx, tmp); if (!tailM && incyOne) { for (n = 0; forEachTile(kstr, n, 0, 2, &result, &memtile); n++) { sprintf(tmp,"%s = uC.%s[%u];\n", kstr[0].buf, outPtrName, n); kgenAddStmt(ctx, tmp); } } else { for (n = 0; forEachTile(kstr, n, 0, 2, &result, &memtile); n++) { genMul(strMul, n, "int", NULL, incyOne ? NULL : "incy"); if (tailM) { sprintf(tmp,"%s = Y[coordA + %u >= M ? 0 : %s];\n", kstr[0].buf, n, strMul); } else { sprintf(tmp,"%s = Y[%s];\n", kstr[0].buf, strMul); } kgenAddStmt(ctx, tmp); } } if (isComplex) { const char *complVec = isDoubleBasedType(tile->dtype) ? "double2" : "float2"; Tile onetile = result; onetile.baseName = NULL; onetile.vecLen = 1; for (n = 0; forEachTile(kstr, n, 0, 3, &result, tile, &onetile); n++) { if (beta0) { sprintf(tmp, "%s = %s * alpha.x + %s.yx * (%s)(-alpha.y, alpha.y);\n", kstr[0].buf, kstr[1].buf, kstr[1].buf, complVec); } else { sprintf(tmp, "%s = %s * beta.x + %s.yx * (%s)(-beta.y, beta.y) + " "%s * alpha.x + %s.yx * (%s)(-alpha.y, alpha.y);\n", kstr[0].buf, kstr[0].buf, kstr[0].buf, complVec, kstr[1].buf, kstr[1].buf, complVec); } kgenAddStmt(ctx, tmp); } } else { for (n = 0; forEachTile(kstr, n, 0, 2, &result, tile); n++) { if (beta0) { sprintf(tmp, "%s = alpha * %s;\n", kstr[0].buf, kstr[1].buf); } else { sprintf(tmp, "%s = beta * %s + alpha * %s;\n", kstr[0].buf, kstr[0].buf, kstr[1].buf); } kgenAddStmt(ctx, tmp); } } if (!tailM && incyOne) { for (i = 0; forEachTile(kstr, i, 0, 2, &result, &memtile); i++) { sprintf(tmp,"uC.%s[%u] = %s;\n", outPtrName, i, kstr[0].buf); kgenAddStmt(ctx, tmp); } } else { if (!tailM) { for (i = 0; forEachTile(kstr, i, 0, 2, &result, &memtile); i++) { sprintf(tmp,"*Y = %s;\n", kstr[0].buf); //sprintf(tmp,"Y[%u * incy] = %s;\n", i, kstr.buf); kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "Y += incy;\n"); } } else { for (n = forEachTile(NULL, 0, 0, 2, &result, &memtile); n != 0; n--) { i = n - 1; forEachTile(kstr, i, 0, 2, &result, &memtile); genMul(strMul, i, "int", NULL, incyOne ? NULL : "incy"); sprintf(tmp,"Y[coordA + %u >= M ? 0 : %s] = %s;\n", i, strMul, kstr[0].buf); kgenAddStmt(ctx, tmp); } } } } void genIncPointers( struct KgenContext *ctx, KernelExtraFlags kflags) { bool incxOne = ((kflags & KEXTRA_INCX_ONE) != 0); bool incyOne = ((kflags & KEXTRA_INCY_ONE) != 0); if (kflags & KEXTRA_A_OFF_NOT_ZERO) { kgenAddStmt(ctx, "A += offA;\n"); } if (kflags & KEXTRA_BX_OFF_NOT_ZERO) { kgenAddStmt(ctx, "X += offX;\n"); } if (kflags & KEXTRA_CY_OFF_NOT_ZERO) { kgenAddStmt(ctx, "Y += offY;\n"); } if (!incxOne) { kgenAddStmt(ctx, "X += incx > 0 ? 0 : (N - 1) * abs(incx);\n"); } if (!incyOne) { kgenAddStmt(ctx, "Y += incy > 0 ? 
0 : (M - 1) * abs(incy);\n"); } } void genStoreLocalResult( struct KgenContext *ctx, Tile *tile, const char *lid) { Kstring kstr; char tmp[1024]; unsigned int i; for (i = 0; forEachTile(&kstr, i, 0, 1, tile); i++) { sprintf(tmp, "localRes[%s][%u] = %s;\n", lid, i, kstr.buf); kgenAddStmt(ctx, tmp); } } void genAddLocalResult( struct KgenContext *ctx, Tile *tile, const char *lid, unsigned int cLocal, unsigned int bStep) { Kstring kstr; char tmp[1024]; unsigned int i; sprintf(tmp, "for (uint i = 1; i < %u; i++)", cLocal); kgenBeginBranch(ctx, tmp); for (i = 0; forEachTile(&kstr, i, 0, 1, tile); i++) { sprintf(tmp, "%s += localRes[%s + i*%u][%u];\n", kstr.buf, lid, bStep, i); kgenAddStmt(ctx, tmp); } kgenEndBranch(ctx, NULL); } void genMergeResults( struct KgenContext *ctx, Tile *result, Tile *source) { unsigned int i; Kstring kstr[2]; char tmp[2048]; for (i = 0; forEachTile(kstr, i, 0, 2, result, source); i++) { sprintf(tmp, "%s += %s;\n", kstr[0].buf, kstr[1].buf); kgenAddStmt(ctx, tmp); } } clblas-2.10/src/library/blas/gens/xxmv_common.h000066400000000000000000000034201264277366700215520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef XXMV_COMMON_H_ #define XXMV_COMMON_H_ #include "blas_kgen.h" #include "gen_helper.h" /* Fetch part of vector x into tile b */ void genFetchX( struct KgenContext *ctx, Tile *tile, unsigned int vecLen, DataType dtype, const KernelVarNames *varNames, TileMulFlags tflags, KernelExtraFlags kflags); void setResultPos( struct KgenContext *ctx, KernelExtraFlags kflags, const char *axVar); void updateResultVectorTiled( struct KgenContext *ctx, KernelExtraFlags kflags, unsigned int vecLen, Tile *tile); void genIncPointers( struct KgenContext *ctx, KernelExtraFlags kflags); void genStoreLocalResult( struct KgenContext *ctx, Tile *tile, const char *lid); void genAddLocalResult( struct KgenContext *ctx, Tile *tile, const char *lid, unsigned int cLocal, unsigned int bStep); /* Store partial result to private result buffer */ void genMergeResults( struct KgenContext *ctx, Tile *result, Tile *source); #endif /* XXMV_COMMON_H_ */ clblas-2.10/src/library/blas/impl.c000066400000000000000000000076151264277366700172120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
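// ---------------------------------------------------------------------------
// Editorial note (not part of the original source): worked examples of what
// the static genMul() helper in xxmv_common.c above emits, following its
// sprintf() branches (buf is any sufficiently large char buffer):
//
//   genMul(buf, 3, "int", "k",  NULL);    // -> "k + 3"
//   genMul(buf, 1, "int", NULL, "incx");  // -> "incx"
//   genMul(buf, 0, "int", "k",  "incx");  // -> "mad24((int)k, (int)incx, (int)0)"
//   genMul(buf, 2, "int", "k",  "incx");  // -> "mad24((int)k + 2, (int)incx, (int)0)"
//
// Also worth noting: the complex update emitted by updateResultVectorTiled(),
//   r = r*beta.x + r.yx*(-beta.y, beta.y) + t*alpha.x + t.yx*(-alpha.y, alpha.y),
// is just component-wise complex multiplication, since for a complex scalar a
// and a complex value z:
//   a*z = (a.x*z.x - a.y*z.y, a.x*z.y + a.y*z.x) = z*a.x + z.yx*(-a.y, a.y).
// ---------------------------------------------------------------------------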
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include clblasStatus clblasSelectImplementation( clblasImplementation impl) { switch (impl) { case clblasDefaultGemm: case clblasLdsBlockGemm: case clblasImageBlockGemm: case clblasBlockGemmWithCaching: clblasSolvers[CLBLAS_GEMM].defaultPattern = getGemmMemPatternIndex(impl); break; case clblasDefaultTrmm: case clblasLdsBlockTrmm: case clblasImageBlockTrmm: case clblasBlockTrmmWithCaching: clblasSolvers[CLBLAS_TRMM].defaultPattern = getTrmmMemPatternIndex(impl); break; case clblasDefaultTrsm: case clblasLdsBlockTrsm: case clblasImageBlockTrsm: case clblasBlockTrsmWithCaching: case clblasBlockTrsmWithoutLds: clblasSolvers[CLBLAS_TRSM].defaultPattern = getTrsmMemPatternIndex(impl); break; default: return clblasInvalidValue; } return clblasSuccess; } int scratchImagesEnabled(void) { int enable = 0; const char *envImpl; envImpl = getenv("AMD_CLBLAS_GEMM_IMPLEMENTATION"); if ((envImpl != NULL) && (strcmp(envImpl, "1") == 0)) { enable = 1; }; envImpl = getenv("AMD_CLBLAS_TRMM_IMPLEMENTATION"); if ((envImpl != NULL) && (strcmp(envImpl, "1") == 0)) { enable = 1; }; envImpl = getenv("AMD_CLBLAS_TRSM_IMPLEMENTATION"); if ((envImpl != NULL) && (strcmp(envImpl, "1") == 0)) { enable = 1; }; return enable; } void parseEnvImplementation(void) { const char *envImpl; envImpl = getenv("AMD_CLBLAS_GEMM_IMPLEMENTATION"); clblasSelectImplementation(clblasDefaultGemm); if (envImpl != NULL) { if (strcmp(envImpl, "0") == 0) { clblasSelectImplementation(clblasLdsBlockGemm); } else if (strcmp(envImpl, "1") == 0) { clblasSelectImplementation(clblasImageBlockGemm); } else if (strcmp(envImpl, "2") == 0) { clblasSelectImplementation(clblasBlockGemmWithCaching); } } envImpl = getenv("AMD_CLBLAS_TRMM_IMPLEMENTATION"); clblasSelectImplementation(clblasDefaultTrmm); if (envImpl != NULL) { if (strcmp(envImpl, "0") == 0) { clblasSelectImplementation(clblasLdsBlockTrmm); } else if (strcmp(envImpl, "1") == 0) { clblasSelectImplementation(clblasImageBlockTrmm); } else if (strcmp(envImpl, "2") == 0) { clblasSelectImplementation(clblasBlockTrmmWithCaching); } } envImpl = getenv("AMD_CLBLAS_TRSM_IMPLEMENTATION"); clblasSelectImplementation(clblasDefaultTrsm); if (envImpl != NULL) { if (strcmp(envImpl, "0") == 0) { clblasSelectImplementation(clblasLdsBlockTrsm); } else if (strcmp(envImpl, "1") == 0) { clblasSelectImplementation(clblasImageBlockTrsm); } else if (strcmp(envImpl, "2") == 0) { clblasSelectImplementation(clblasBlockTrsmWithoutLds); } else if (strcmp(envImpl, "3") == 0) { clblasSelectImplementation(clblasBlockTrsmWithCaching); } } } clblas-2.10/src/library/blas/include/000077500000000000000000000000001264277366700175175ustar00rootroot00000000000000clblas-2.10/src/library/blas/include/blas_funcs.h000066400000000000000000000041171264277366700220120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
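// ---------------------------------------------------------------------------
// Editorial usage note (not part of the original source), summarizing
// parseEnvImplementation() and scratchImagesEnabled() in impl.c above.
// Assumption: these environment variables are read when the library starts
// up, so they have to be set before it is initialized.
//
//   AMD_CLBLAS_GEMM_IMPLEMENTATION / AMD_CLBLAS_TRMM_IMPLEMENTATION:
//     "0" -> LDS-blocked, "1" -> image-based, "2" -> blocked with caching;
//     any other value leaves the default implementation selected.
//   AMD_CLBLAS_TRSM_IMPLEMENTATION:
//     "0" -> LDS-blocked, "1" -> image-based, "2" -> blocked without LDS,
//     "3" -> blocked with caching.
//   A value of "1" for any of the three also makes scratchImagesEnabled()
//   return 1.
//
// For example (POSIX):
//
//   setenv("AMD_CLBLAS_GEMM_IMPLEMENTATION", "2", 1);  // clblasBlockGemmWithCaching
// ---------------------------------------------------------------------------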
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Blas function identifiers and properties */ #ifndef BLASFUNCS_H_ #define BLASFUNCS_H_ #include #ifdef __cplusplus extern "C" { #endif typedef enum BlasFunctionID { CLBLAS_GEMV, CLBLAS_SYMV, CLBLAS_GEMM, CLBLAS_TRMM, CLBLAS_TRSM, CLBLAS_SYRK, CLBLAS_SYR2K, CLBLAS_TRMV, CLBLAS_HEMV, CLBLAS_TRSV, CLBLAS_TRSV_GEMV, // Need a Kludge as current "gemv" don't support complex types CLBLAS_SYMM, CLBLAS_SYMM_DIAGONAL, CLBLAS_HEMM_DIAGONAL, CLBLAS_GEMM2, CLBLAS_GEMM_TAIL, CLBLAS_SYR, CLBLAS_SYR2, CLBLAS_GER, CLBLAS_HER, CLBLAS_HER2, CLBLAS_HEMM, CLBLAS_HERK, CLBLAS_TPMV, CLBLAS_SPMV, CLBLAS_HPMV, CLBLAS_TPSV, CLBLAS_SPR, CLBLAS_SPR2, CLBLAS_HPR, CLBLAS_HPR2, CLBLAS_GBMV, CLBLAS_TBMV, CLBLAS_SBMV, CLBLAS_HBMV, CLBLAS_TBSV, CLBLAS_SWAP, CLBLAS_SCAL, CLBLAS_COPY, CLBLAS_AXPY, CLBLAS_DOT, CLBLAS_REDUCTION_EPILOGUE, CLBLAS_ROTG, CLBLAS_ROTMG, CLBLAS_ROT, CLBLAS_ROTM, CLBLAS_iAMAX, CLBLAS_NRM2, CLBLAS_ASUM, CLBLAS_TRANSPOSE, /* ! Must be the last */ BLAS_FUNCTIONS_NUMBER } BlasFunctionID; int funcBlasLevel(BlasFunctionID funcID); bool funcHasBeta(BlasFunctionID funcID); bool funcHasTriangMatrix(BlasFunctionID funcID); #ifdef __cplusplus } #endif #endif /* BLASFUNCS_H_ */ clblas-2.10/src/library/blas/include/blas_mempat.h000066400000000000000000000205541264277366700221620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Related to BLAS memory patterns */ #ifndef BLAS_MEMPAT_H_ #define BLAS_MEMPAT_H_ #include #include #include #include /** * @brief Type of internal function implementation */ typedef enum clblasImplementation { clblasDefaultGemm, /**< Default: let the library decide what to use. */ clblasLdsBlockGemm, /**< Use blocked GEMM with LDS optimization. */ clblasImageBlockGemm, /**< Use blocked GEMM with image-based... */ clblasBlockGemmWithCaching, /**< Use blocked GEMM with cache-usage optimization. */ clblasSubgroupGemmWithCaching,/**< Use subgroup GEMM with cache-usage optimization. */ clblasDefaultTrmm, /**< Default: let the library decide what to use. */ clblasLdsBlockTrmm, /**< Use blocked TRMM with LDS optimization. */ clblasImageBlockTrmm, /**< Use blocked TRMM with image-based... */ clblasBlockTrmmWithCaching, /**< Use blocked TRMM with cache-usage optimization. */ clblasSubgroupTrmmWithCaching,/**< Use subgroup TRMM with cache-usage optimization. 
*/ clblasDefaultTrsm, /**< Default: let the library decide what to use. */ clblasLdsBlockTrsm, /**< Use blocked TRSM with LDS optimization. */ clblasImageBlockTrsm, /**< Use blocked TRSM with image-based... */ clblasBlockTrsmWithCaching, /**< Use blocked TRSM with cache-usage optimization. */ clblasBlockTrsmWithoutLds, clblasDefaultSyrk, clblasBlockSyrk, clblasSubgSyrk, clblasDefaultSyr2k, clblasBlockSyr2k, clblasSubgSyr2k } clblasImplementation; /** * @internal * @brief extra information for a memory pattern * used for BLAS problem solving * @ingroup BLAS_SOLVERIF_SPEC */ typedef struct CLBLASMpatExtra { /** memory levels used to store blocks of matrix A */ meml_set_t aMset; /** memory levels used to store blocks of matrix B */ meml_set_t bMset; CLMemType mobjA; CLMemType mobjB; } CLBLASMpatExtra; /* * init memory patterns for the xGEMM functions * * Returns number of the initialized patterns */ unsigned int initGemmMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xGEMM pattern */ int getGemmMemPatternIndex(clblasImplementation impl); /* * Get preferred xGEMM pattern */ clblasImplementation getGemmPreferredPattern(void); /* * init memory patterns for the xGEMV functions * * Returns number of the initialized patterns */ unsigned int initGemvMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xGEMV pattern */ int getGemvMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xSYMV functions * * Returns number of the initialized patterns */ unsigned int initSymvMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYMV pattern */ int getSymvMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xTRMM functions * * Returns number of the initialized patterns */ unsigned int initTrmmMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xTRMM pattern */ int getTrmmMemPatternIndex(clblasImplementation impl); /* * Get preferred xTRMM pattern */ clblasImplementation getTrmmPreferredPattern(void); /* * init memory patterns for the xTRSM functions * * Returns number of the initialized patterns */ unsigned int initTrsmMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xTRSM pattern */ int getTrsmMemPatternIndex(clblasImplementation impl); /* * Get preferred xTRSM pattern */ clblasImplementation getTrsmPreferredPattern(void); /* * init memory patterns for the xSYR2K functions * * Returns number of the initialized patterns */ unsigned int initSyr2kMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYR2K pattern */ int getSyr2kMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xSYRK functions * * Returns number of the initialized patterns */ unsigned int initSyrkMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYRK pattern */ int getSyrkMemPatternIndex(clblasImplementation impl); /* * init memory patters for TRMV routine * Returns the number of inited patterns */ unsigned int initTrmvMemPatterns(MemoryPattern *mempats); int getTrmvMemPatternIndex(clblasImplementation impl); /* * init memory patterns for TRSV TRTRI routine * Returns the number of inited patterns */ unsigned int initTrsvMemPatterns(MemoryPattern *mempats); int getTrsvMemPatternIndex(clblasImplementation impl); unsigned int initTrsvGemvMemPatterns(MemoryPattern *mempats); int getTrsvGemvMemPatternIndex(clblasImplementation impl); unsigned int initSymmMemPatterns(MemoryPattern *mempats); int getSymmMemPatternIndex(clblasImplementation impl); unsigned int 
initGemmV2MemPatterns(MemoryPattern *mempats); int getGemmV2MemPatternIndex(clblasImplementation impl); unsigned int initGemmV2TailMemPatterns(MemoryPattern *mempats); int getGemmV2TailMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xSYR functions * * Returns number of the initialized patterns */ unsigned int initSyrMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYR pattern */ int getSyrMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xSYR2 functions * * Returns number of the initialized patterns */ unsigned int initSyr2MemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYR2 pattern */ int getSyr2MemPatternIndex(clblasImplementation impl); /* * init memory patters for GER routine * Returns the number of inited patterns */ unsigned int initGerMemPatterns(MemoryPattern *mempats); int getGerMemPatternIndex(clblasImplementation impl); unsigned int initHerMemPatterns(MemoryPattern *mempats); /* * Get index of the specific xSYR pattern */ int getHerMemPatternIndex(clblasImplementation impl); /* * init memory patterns for the xHER2 functions * * Returns number of the initialized patterns */ unsigned int initHer2MemPatterns(MemoryPattern *mempats); /* * Get index of the specific xHER2 pattern */ int getHer2MemPatternIndex(clblasImplementation impl); unsigned int initGbmvMemPatterns(MemoryPattern *mempats); int getGbmvMemPatternIndex(clblasImplementation impl); unsigned int initSwapMemPatterns(MemoryPattern *mempats); int getSwapMemPatternIndex(clblasImplementation impl); unsigned int initScalMemPatterns(MemoryPattern *mempats); int getScalMemPatternIndex(clblasImplementation impl); unsigned int initCopyMemPatterns(MemoryPattern *mempats); int getCopyMemPatternIndex(clblasImplementation impl); unsigned int initDotMemPatterns(MemoryPattern *mempats); int getDotMemPatternIndex(clblasImplementation impl); unsigned int initAxpyMemPatterns(MemoryPattern *mempats); int getAxpyMemPatternIndex(clblasImplementation impl); unsigned int initReductionMemPatterns(MemoryPattern *mempats); int getReductionMemPatternIndex(clblasImplementation impl); unsigned int initRotgMemPatterns(MemoryPattern *mempats); int getRotgMemPatternIndex(clblasImplementation impl); unsigned int initRotmgMemPatterns(MemoryPattern *mempats); int getRotmgMemPatternIndex(clblasImplementation impl); unsigned int initRotmMemPatterns(MemoryPattern *mempats); int getRotmMemPatternIndex(clblasImplementation impl); unsigned int initiAmaxMemPatterns(MemoryPattern *mempats); int getiAmaxMemPatternIndex(clblasImplementation impl); unsigned int initNrm2MemPatterns(MemoryPattern *mempats); int getNrm2MemPatternIndex(clblasImplementation impl); unsigned int initAsumMemPatterns(MemoryPattern *mempats); int getAsumMemPatternIndex(clblasImplementation impl); #endif /* BLAS_MEMPAT_H_ */ clblas-2.10/src/library/blas/include/clblas-internal.h000066400000000000000000000265351264277366700227550ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_INTERNAL_H_ #define CLBLAS_INTERNAL_H_ #include #include #include #include #include "blas_funcs.h" #include "kernel_extra.h" #if defined(_MSC_VER) #define VISIBILITY_HIDDEN #else #define VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) #endif #ifdef __cplusplus extern "C" { #endif struct SolutionStep; typedef struct CLBlasSolvers { MemoryPattern memPatterns[MEMPAT_PER_BLASFN]; unsigned int nrPatterns; int defaultPattern; /* -1 -- select among all available patterns * >= 0 -- index for memPatterns[] */ } CLBlasSolvers; extern int clblasInitialized; extern CLBlasSolvers clblasSolvers[BLAS_FUNCTIONS_NUMBER]; extern struct KernelCache *clblasKernelCache; typedef union ArgMultiplier { cl_float argFloat; cl_double argDouble; FloatComplex argFloatComplex; DoubleComplex argDoubleComplex; } ArgMultiplier; typedef union LeadingDimention { size_t matrix; /**< Positive ld value for matrixes */ int vector; /**< Integer offset value for vectors */ } LeadingDimention; typedef enum reductionType { REDUCE_BY_SUM, REDUCE_BY_MAX, REDUCE_BY_MIN, REDUCE_MAX_WITH_INDEX, REDUCE_BY_HYPOT, REDUCE_BY_SSQ, REDUCE_MAX_WITH_INDEX_ATOMICS } reductionType; /** * @internal * @brief Kernel arguments for solver methods * @ingroup SUBMIT_PROBLEM */ typedef struct CLBlasKargs { BlasFunctionID pigFuncID; // FuncID piggy backing on this call. Used by Blas-3 routines to take advantage of GEMM code /** Kernel type to pass the arguments for */ CLBlasKernelType kernType; DataType dtype; /**< Data type */ clblasOrder order; /**< Row/column order */ clblasSide side; /**< Matrix A side */ clblasUplo uplo; /**< Matrix A is upper/lower */ clblasTranspose transA; /**< Operation to be applied to matrix A */ clblasTranspose transB; /**< Operation to be applied to matrix B */ clblasDiag diag; /**< Matrix A diagonality */ size_t M; /**< Problem size in M dimension */ size_t N; /**< Problem size in N dimension */ size_t K; /**< Problem size in K dimension, or number of diagonals in a banded-matrix */ ArgMultiplier alpha; /**< Alpha multiplier */ cl_mem A; /**< Matrix A data */ LeadingDimention lda; /**< Matrix A leading dimension */ cl_mem B; /**< Matrix B data */ LeadingDimention ldb; /**< Matrix B or vector X leading dimension */ ArgMultiplier beta; /**< Beta multiplier */ cl_mem C; /**< Matrix C data */ LeadingDimention ldc; /**< Matrix C or vector Y leading dimension */ cl_mem D; /**< Extra cl_mem buffer. For scratch usage or other purpose */ cl_mem E; /**< Extra buffer.. 
Needed for blas 1 functions */ int addrBits; /**< Number of device address bits */ /** Problem start offset in M dimension to process from */ size_t offsetM; /** Problem start offset in N dimension to process from */ size_t offsetN; /** Problem start offset in K dimension to process from */ size_t offsetK; cl_mem scimage[2]; /**< Scratch images */ size_t offA; /**< Offset of first element of matrix A */ /** Offset of first element of matrix B or vector X */ size_t offBX; /**< Offset of first element of matrix C or vector Y */ size_t offCY; size_t offa; /**< Offset of first element of Matrix A */ size_t offb; /**< Offset of first element of Matrix B */ size_t offc; /**< Offset of first element of Matrix C */ size_t offd; /**< Offset of first element of buffer D */ size_t offe; /**< Offset of first element of buffer E */ cl_int startRow; /**< Triangular Solver - Identify where the triangle starts */ cl_int endRow; /**< Triangular Solver - Identify where the triangle ends */ size_t tailStartM; // Tail Kernel for GEMM2 size_t tailStartN; // Tail Kernel for GEMM2 size_t KL; // Number of sub-diagonals in a banded-matrix size_t KU; // Number of super-diagonals in a banded-matrix reductionType redctnType; // To store kind of reduction for reduction-framewrok to handle -- enum } CLBlasKargs; /** * @internal * @brief Initialize the binary cache (on disk) for OpenCL programs */ void clblasInitBinaryCache(void); /* * Clear all registered functor caches */ void cleanFunctorCaches(void); static __inline bool areKernelsCacheable(void) { return (clblasKernelCache != NULL); } /* * Assign a scalar multiplied on a matrix as a kernel argument */ void assignScalarKarg(KernelArg *arg, const void *value, DataType dtype); /** * calculate amount of global threads needed to compute all the problem * * @wgDim: Subproblem dimension at the level where the previous level subproblem * is distributed among different work groups * @M: problem size in dimension M before the distributing * @N: problem size in dimension N before the distributing */ void calcGlobalThreads( size_t globalThreads[2], const SubproblemDim *wgDim, const PGranularity *pgran, size_t M, size_t N); /** * @internal * @brief Get the context associated with kernel. * * @param[in] kernel Kernel object being queried. * @param[out] context The context. * * @return clGetKernelInfo() return code. */ cl_int getKernelContext( cl_kernel kernel, cl_context *context); /** * @brief Get the context associated with queue. * * @param[in] queue Queue being queried. * @param[out] context The context. * * @return clGetCommandQueueInfo() return code. */ cl_int getQueueContext( cl_command_queue queue, cl_context *context); /** * @internal * @brief Get the device specified when the command-queue is created. * * @param[in] queue Queue being queried. * @param[out] device The device. * * @return clGetCommandQueueInfo() return code. */ cl_int getQueueDevice( cl_command_queue queue, cl_device_id *device); /** * @internal * @brief Get the currently specified properties for the command-queue. * * @param[in] queue Queue being queried. * @param[out] props Properties. * * @return clGetCommandQueueInfo() return code. 
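// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): one way the CLBlasKargs
// structure above could be filled for a single-precision, column-major,
// non-transposed GEMM. Field meanings follow the comments in the struct;
// bufA/bufB/bufC are hypothetical cl_mem buffers and the sizes are made up.
// ---------------------------------------------------------------------------
//
//   CLBlasKargs kargs;
//   memset(&kargs, 0, sizeof(kargs));
//   kargs.dtype  = TYPE_FLOAT;
//   kargs.order  = clblasColumnMajor;
//   kargs.transA = clblasNoTrans;
//   kargs.transB = clblasNoTrans;
//   kargs.M = 1024;                             // rows of C
//   kargs.N = 768;                              // columns of C
//   kargs.K = 512;                              // inner dimension
//   kargs.alpha.argFloat = 1.0f;                // ArgMultiplier union, float member
//   kargs.beta.argFloat  = 0.0f;
//   kargs.A = bufA;  kargs.lda.matrix = 1024;   // LeadingDimention union, matrix member
//   kargs.B = bufB;  kargs.ldb.matrix = 512;
//   kargs.C = bufC;  kargs.ldc.matrix = 1024;
//   kargs.offA = 0;  kargs.offBX = 0;  kargs.offCY = 0;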
*/ cl_int getQueueProperties( cl_command_queue queue, cl_command_queue_properties *props); Kernel *makeKernelCached( cl_device_id device, cl_context context, solver_id_t sid, KernelKey * key, SolverKgen kernelGenerator, const SubproblemDim *dims, const PGranularity *pgran, const CLBLASKernExtra *extra, const char *buildOpts, cl_int *error); Kernel *makeKernel( cl_device_id device, cl_context context, SolverKgen kernelGenerator, cl_program program, const SubproblemDim *dims, const PGranularity *pgran, const CLBLASKernExtra *extra, const char *buildOpts, cl_int *error); Kernel *loadKernel( const unsigned char** buffer, size_t sizeBuffer, KernelKey *key, const CLBLASKernExtra *extra, cl_int *error); /* * TODO: doxygen style comments */ void setupBuildOpts( char opts[BUILD_OPTS_MAXLEN], cl_device_id devID, MemoryPattern *mempat); void addBuildOpt( char * opts, size_t len, const char * option); // Internal scatter image API int initSCImages(void); void releaseSCImages(void); /** * Request an image appropriating the most to perform a user API request * * @ctx: context containing images * @devID: id of device the image will used for * @bestSize: size of image, i. e. minWidth*bestHeight of the image that should * be enough to solve a problem in single step * @minSize: minimal size of image image, i. e. minWidth*minHeight * @minWidth: minimal image width * * Returns memory object of the most appropriate image. If there are * not images available for the device or not enough memory, to allocate * some internal structures to save a usage info the function returns NULL. */ cl_mem getSCImage( cl_context ctx, cl_device_id devID, cl_ulong bestSize, cl_ulong minSize, size_t minWidth); void putSCImage(cl_device_id devID, cl_mem image); char *sprintfGranulation(char *buf, const SubproblemDim *dim, int level); const char *kernelTypeString(CLBlasKernelType ktype); #ifdef DUMP_CLBLAS_KERNELS void dumpKernel( const struct SolutionStep *step, CLBlasKernelType ktype); #else /* DUMP_CLBLAS_KERNEL */ // stub, does nothing #define dumpKernel(step, ktype) #endif /* !DUMP_CLBLAS_KERNEL */ static __inline solver_id_t makeSolverID(int fid, int mpat) { return (solver_id_t)(fid * MEMPAT_PER_BLASFN + mpat); } static __inline int solverFunctionID(solver_id_t sid) { return (sid / MEMPAT_PER_BLASFN); } static __inline int solverPattern(solver_id_t sid) { return (sid % MEMPAT_PER_BLASFN); } typedef enum ErrorCodeSet { A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET, END_ERRSET } ErrorCodeSet; clblasStatus checkMatrixSizes( DataType dtype, clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_mem A, size_t offA, size_t lda, ErrorCodeSet err ); clblasStatus checkBandedMatrixSizes( DataType dtype, clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_mem A, size_t offA, size_t lda, ErrorCodeSet err ); clblasStatus checkVectorSizes( DataType dtype, size_t N, cl_mem x, size_t offx, int incx, ErrorCodeSet err ); clblasStatus checkMemObjects( cl_mem A, cl_mem B, cl_mem C, bool checkC, ErrorCodeSet errA, ErrorCodeSet errB, ErrorCodeSet errC ); /** * @brief Set preferred function internal implementation. * * Some BLAS functions are implemented in several different ways internally. * By default the library tries to select the most suitable implementation for * given problem. Using this function user can force library to use specific one. * * @return \b clblasSuccess on success, \b clblasInvalidValue if an * unknown implementation id was passed. 
*/ clblasStatus clblasSelectImplementation( clblasImplementation impl); /** * @brief Set preferred implementation according to environment variable. */ void parseEnvImplementation(void); /** * @brief Check whether it is allowed to use scratch images */ int scratchImagesEnabled(void); #ifdef __cplusplus } #endif #endif /* CLBLAS_INTERNAL_H_ */ clblas-2.10/src/library/blas/include/events.h000066400000000000000000000017421264277366700212000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Events used during SolutionStep decomposition internally. */ #ifndef EVENTS_H_ #define EVENTS_H_ void decomposeEventsSetup(void); void decomposeEventsTeardown(void); cl_event* decomposeEventsAlloc(void); #endif /* EVENTS_H_ */ clblas-2.10/src/library/blas/include/kprintf.hpp000066400000000000000000000115121264277366700217050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef __KPRINTF_HPP__ #define __KPRINTF_HPP__ #include #include #include #include #include #include typedef enum REDUCTION_TYPE { REDUCTION_BY_SUM, REDUCTION_BY_MAX, REDUCTION_BY_MIN, REDUCTION_BY_HYPOT, REDUCTION_BY_SSQ } REDUCTION_TYPE; typedef enum RedWithIndexImpl { ATOMIC_FLI, REG_FLI, ATOMIC_FHI, REG_FHI } RedWithIndexImpl; class kprintf { public: typedef struct fmt { const char *key; const char *value; }fmt_t; private: enum SRV { SCALAR, VECTOR }; const char *HALFWORD; // 1/2 of DERIVED const char *QUARTERWORD; // 1/4 of DERIVED const char *HALFQUARTERWORD; // 1/8 of DERIVED const char *VLOADWORD; const char *DERIVED; const char *BASE; bool doVLOAD; bool doVSTORE; char dataType; // For mystrtok() char* strtokPtr; int strtokCount; enum SRV s_or_v; int vectorWidth, effectiveVectorWidthOnBaseType; size_t maxKeySize; int wgSize; std::vector v; struct fmt get(const char *key); const char *findType(char *type); const char *findVectorWidthType(char *type); const char *findTypeVLOAD(char *type); const char *findTypeVSTORE(char *type); void generateVecSuffix(char *p, int n); void registerType(const char *baseType, int vecWidth, int internalVecWidth=1); void registerReducedTypes( const char* in, int div); void registerSuperTypes( const char* in, int mul); char* mystrtok( char* in, const char* tok); //NOTE: strtok overwrites the string. we dont like that... // // VLOAD %TYPE%V from (%PTYPE*) kind of memory locations // The Kernel writers should use "%TYPE" and "%TYPE%V" for kernel aguments, local variables etc.. // However, while loading using %VLOAD, they should cast the pointers as "%PTYPE *" because // VLOADn imposes certain restrictions. // Having the pointers as %TYPE and %TYPE%V relieves us from address calculations for primitives // which are vectors (like float2, double2 etc..) 
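// ---------------------------------------------------------------------------
// Editorial sketch (not part of the original source): the public interface of
// this class suggests kprintf is a small template expander for generated
// kernels -- keys such as %TYPE, %TYPE%V and %PTYPE are derived from a base
// type and vector width, extra keys can be added with put(), and spit()
// rewrites a kernel template into dst. Roughly ('S' assumed to mean single
// precision; exact expansions unverified):
//
//   kprintf kp('S', 4, true, true);        // base type + vector width 4
//   kp.put("%FUNC", "my_kernel");          // user-defined key/value pair
//   kp.spit(dstBuffer, kernelTemplate);    // expand all registered keys
// ---------------------------------------------------------------------------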
// void registerVLOAD(); void registerVSTORE(); void registerVectorWidth(); void handleMakeVector(char **_src, char **_dst, int div = 1); void handleMUL(char **_src, char **_dst, bool vmul=false); void handleMAD(char **_src, char **_dst, bool vmul=false); void handleDIV(char **_src, char **_dst, bool vdiv=false); void handleADD_SUB(char **_src, char **_dst, const char op); void handleVLoadWithIncx(char **_src, char **_dst, bool ignoreFirst = false); void handleVStoreWithIncx(char **_src, char **_dst); void handleReduceSum(char **_src, char **_dst); void handleReduceSumReal(char **_src, char **_dst, int vlength); void handleReduceMax(char **_src, char **_dst); void handleReduceMin(char **_src, char **_dst); void handleReduceHypot(char **_src, char **_dst); void handleCONJUGATE(char **_src, char **_dst); void handleClearImaginary(char **_src, char **_dst); void handleAlignedDataAccess(char **_src, char **_dst); void handleAlignedVSTORE(char **_src, char **_dst); void handlePredicate(char **_src, char **_dst); void handleComplexJoin(char **_src, char **_dst); void doConstruct(const char *type, int vecWidth, bool doVLOAD, bool doVSTORE, int wgSize); void handleVMAD_AND_REDUCE(char **_src, char **_dst); void handleMAD_AND_REDUCE(char **_src, char **_dst); void handleVFOR(char **_src, char **_dst, bool isReal); void handleReductionFramework(char **_src, char **_dst, REDUCTION_TYPE reductionType= REDUCTION_BY_SUM); void handleVABS(char **_src, char **_dst); void getRandomString(char *str, int length); public: kprintf(char _type, int vecWidth=1, bool doVLOAD=false, bool doVSTORE = false, int wgSize=64); kprintf(const char *type, int vecWidth=1, bool doVLOAD=false, bool doVSTORE=false, int wgSize=64); void put(const char *key, const char *value); // // PENDING: // Needs ammendment at a later point of time when we support MACROS // int real_strlen(const char *src); void spit(char *dst, char *src); }; #endif clblas-2.10/src/library/blas/include/matrix_dims.h000066400000000000000000000037061264277366700222160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef MATRIX_DIMS_H_ #define MATRIX_DIMS_H_ #include #include #include #include #ifdef __cplusplus extern "C" { #endif void swapDimXY(SubproblemDim *dim); size_t matrBlockPitch( const SubproblemDim *dim, MatrixRole mrole, DataType dtype, clblasSide side); cl_ulong matrBlockSize( SubproblemDim *dim, MatrixRole mrole, DataType dtype, clblasSide side); size_t matrBlockHeight( SubproblemDim *dim, MatrixRole mrole, clblasSide side); /* * Transform respective kernel arguments to problem dimension. * if 'offset' is set to true, then it transform starting offsets * to process matrices from, otherwise it transforms matrix sizes. * It ignores 'bwidth' field in offset mode. 
*/ void kargsToProbDims( SubproblemDim *probDim, BlasFunctionID funcID, const CLBlasKargs *kargs, bool offset); /* * Transform problem dimensions to respective kernel arguments. * In the offset mode it ignore 'offsetK' and always sets it to 0 */ void probDimsToKargs( CLBlasKargs *kargs, BlasFunctionID funcID, SubproblemDim *blasDim, bool offset); #ifdef __cplusplus } #endif #endif /* MATRIX_DIMS_H_ */ clblas-2.10/src/library/blas/include/matrix_props.h000066400000000000000000000036621264277366700224260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef MATRIX_PROPS_H_ #define MATRIX_PROPS_H_ #include #include "clblas-internal.h" #include "blas_funcs.h" #include "matrix_props.h" typedef enum MatrixRole { MATRIX_A, MATRIX_B, MATRIX_C, MATRIX_ROLES_NUMBER } MatrixRole; /* * Functions to deal with kernel extra flags */ // Is a matrix should be conjugated bool isMatrixConj(KernelExtraFlags flags, MatrixRole mrole); /* * Is a matrix accessed in the column-major order */ bool isMatrixAccessColMaj( BlasFunctionID funcID, KernelExtraFlags flags, MatrixRole mrole); /* * Triangularity type at the physical layout with account * of solution element indices the largest part makes * a contribution to. That means a right-side, non transposed, * upper diagonal matrix is considered as the lower triangular * since the largest part make a contribution to solution elements * with a highest index. */ static __inline bool isMatrixUpper(KernelExtraFlags kflags); static __inline bool isMatrixUpper(KernelExtraFlags kflags) { return (((kflags & KEXTRA_UPPER_TRIANG) != 0) ^ ((kflags & KEXTRA_TRANS_A) != 0) ^ ((kflags & KEXTRA_SIDE_RIGHT) != 0)); } #endif /* MATRIX_PROPS_H_ */ clblas-2.10/src/library/blas/include/solution_seq.h000066400000000000000000000122051264277366700224140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SOLUTION_SEQ_H_ #define SOLUTION_SEQ_H_ #include #include #include #include #include "blas_funcs.h" #include "clblas-internal.h" #ifdef __cplusplus extern "C" { #endif // subproblem dimension components typedef enum SDimConponent { SDIM_X, SDIM_Y, SDIM_BWIDTH } SDimComponent; typedef struct SolutionStep { BlasFunctionID funcID; Kernel *kernels[MAX_CLBLAS_KERNELS_PER_STEP]; CLBlasKargs args; cl_command_queue cmdQueue; TargetDevice device; cl_uint numEventsInWaitList; const cl_event *eventWaitList; cl_event *event; unsigned int patternID; SubproblemDim subdims[MAX_SUBDIMS]; PGranularity pgran; KernelExtraFlags extraFlags; ListNode node; } SolutionStep; /** * @internal * @brief Make solution sequence * * @param[in] funcID BLAS function ID * @param[in] args BLAS parameters * @param[in] numCommandQueues Number of the command queues * @param[in] commandQueues Command queues to distribute the problem * among * @param[in] numEventsInWaitList Number of events in the wait list * @param[in] eventWaitList List of events which must fire before any * of the problem's kernels can be executed * @param[out] events List of output events signaling on * completion of evaluating the problem for * the command queues. * @param[out] seq Solution sequence head which will be * followed by all needed solution steps * after the function returns * * @returns * - \b CL_SUCCESS on success; * - \b CL_INVALID_VALUE if \b numCommandQueues is zero, or * \b commandQueues is NULL; * - \b CL_INVALID_DEVICE if the function ID indicates that this is * a double precision function, but any of the command queue's devices * does not support double precision; * - \b CL_INVALID_COMMAND_QUEUE if any of the passed command queues is * invalid; * - \b CL_OUT_OF_HOST_MEMORY if there is not enough memory to allocate * internal structures; * - \b CL_OUT_OF_HOST_RESOURCES if required scratch resources are * unavailable. * * @ingroup SUBMIT_PROBLEM */ cl_int makeSolutionSeq( BlasFunctionID funcID, const CLBlasKargs *args, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, ListHead *seq); /** * @internal * @brief Free solution sequence * * @param[out] seq Solution sequence to free * * It initializes the list after freeing. * * @ingroup SUBMIT_PROBLEM */ void freeSolutionSeq(ListHead *seq); void freeSolutionStep(ListNode *node); /** * @internal * @brief Execute solution sequence * * @param[in] seq Sequence to execute * * @returns CL_SUCCESS on success, errors from a clEnqueueNDRangeKernel() call * otherwise. * * @ingroup SUBMIT_PROBLEM */ cl_int executeSolutionSeq(const ListHead *seq); /* * Get math decomposition of a solution step in order * to accelerate its evaluation of faster kernels for * other functions. The step must inserted into a * solution sequence. 
*/ ListNode *decomposeProblemStep(SolutionStep *step); cl_int selectVectorization(const SolutionStep *step, CLBLASKernExtra *kextra); // Find vector length which lda and tile width is divisible on unsigned int appropriateVecLen(size_t ld, unsigned int typeSize, size_t tileWidth, int funcLevel); KernelExtraFlags VISIBILITY_HIDDEN clblasArgsToKextraFlags( const CLBlasKargs *args, BlasFunctionID funcID); void VISIBILITY_HIDDEN getStepGranulation(SolutionStep *step); bool VISIBILITY_HIDDEN dimensionsExceedProblemSize(SolutionStep *step); void VISIBILITY_HIDDEN getMinimalStepGranulation(SolutionStep *step); void VISIBILITY_HIDDEN detectProblemTails(SolutionStep *step); void VISIBILITY_HIDDEN detectOffsets(SolutionStep *step); unsigned int VISIBILITY_HIDDEN selectPattern( SolutionStep* pStep, unsigned int maxImages); void VISIBILITY_HIDDEN fixupGemmOffsets(CLBlasKargs *kargs, KernelExtraFlags kflags, size_t offsetK); #ifdef __cplusplus } #endif #endif /* SOLUTION_SEQ_H_ */ clblas-2.10/src/library/blas/include/xgemm.h000066400000000000000000000021331264277366700210040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2015 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //some help functions #ifndef CLBLAS_XGEMM_H #define CLBLAS_XGEMM_H #ifdef __cplusplus extern "C" { #endif void makeGemmKernel( cl_kernel *clKernel, cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, size_t *kernelBinarySize, const char *binaryBuildOptions); #ifdef __cplusplus } #endif #endifclblas-2.10/src/library/blas/init.c000066400000000000000000000206011264277366700172020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include #include #include #ifdef BUILDING_CLBLAS #include "AutoGemmTeardown.h" #include "UserGemmClKernels.h" #endif clblasStatus clblasGetVersion(cl_uint* major, cl_uint* minor, cl_uint* patch) { *major = clblasVersionMajor; *minor = clblasVersionMinor; *patch = clblasVersionPatch; return clblasSuccess; } clblasStatus clblasSetup(void) { solver_id_t sidsNum; char* tmp = NULL; // Made the cache unlimited by default size_t kCacheLimit = 0; if (clblasInitialized) { return clblasSuccess; } // printf("\n%s, line %d\n", __func__, __LINE__); initMallocTrace(); clblasInitBinaryCache(); clblasSolvers[CLBLAS_GEMM].nrPatterns = initGemmMemPatterns(clblasSolvers[CLBLAS_GEMM].memPatterns); clblasSolvers[CLBLAS_GEMM].defaultPattern = -1; clblasSolvers[CLBLAS_TRMM].nrPatterns = initTrmmMemPatterns(clblasSolvers[CLBLAS_TRMM].memPatterns); clblasSolvers[CLBLAS_TRMM].defaultPattern = -1; clblasSolvers[CLBLAS_TRSM].nrPatterns = initTrsmMemPatterns(clblasSolvers[CLBLAS_TRSM].memPatterns); clblasSolvers[CLBLAS_TRSM].defaultPattern = -1; clblasSolvers[CLBLAS_GEMV].nrPatterns = initGemvMemPatterns(clblasSolvers[CLBLAS_GEMV].memPatterns); clblasSolvers[CLBLAS_GEMV].defaultPattern = -1; clblasSolvers[CLBLAS_SYMV].nrPatterns = initSymvMemPatterns(clblasSolvers[CLBLAS_SYMV].memPatterns); clblasSolvers[CLBLAS_SYMV].defaultPattern = -1; clblasSolvers[CLBLAS_SYR2K].nrPatterns = initSyr2kMemPatterns(clblasSolvers[CLBLAS_SYR2K].memPatterns); clblasSolvers[CLBLAS_SYR2K].defaultPattern = -1; clblasSolvers[CLBLAS_SYRK].nrPatterns = initSyrkMemPatterns(clblasSolvers[CLBLAS_SYRK].memPatterns); clblasSolvers[CLBLAS_SYRK].defaultPattern = -1; clblasSolvers[CLBLAS_TRMV].nrPatterns = initTrmvMemPatterns(clblasSolvers[CLBLAS_TRMV].memPatterns); clblasSolvers[CLBLAS_TRMV].defaultPattern = -1; // HEMV uses the same memory pattern as TRMV. 
clblasSolvers[CLBLAS_HEMV].nrPatterns = initTrmvMemPatterns(clblasSolvers[CLBLAS_HEMV].memPatterns); clblasSolvers[CLBLAS_HEMV].defaultPattern = -1; clblasSolvers[CLBLAS_TRSV].nrPatterns = initTrsvMemPatterns(clblasSolvers[CLBLAS_TRSV].memPatterns); clblasSolvers[CLBLAS_TRSV].defaultPattern = -1; clblasSolvers[CLBLAS_TRSV_GEMV].nrPatterns = initTrsvGemvMemPatterns(clblasSolvers[CLBLAS_TRSV_GEMV].memPatterns); clblasSolvers[CLBLAS_TRSV_GEMV].defaultPattern = -1; clblasSolvers[CLBLAS_SYMM].nrPatterns = initSymmMemPatterns(clblasSolvers[CLBLAS_SYMM].memPatterns); clblasSolvers[CLBLAS_SYMM].defaultPattern = -1; clblasSolvers[CLBLAS_GEMM2].nrPatterns = initGemmV2MemPatterns(clblasSolvers[CLBLAS_GEMM2].memPatterns); clblasSolvers[CLBLAS_GEMM2].defaultPattern = -1; clblasSolvers[CLBLAS_GEMM_TAIL].nrPatterns = initGemmV2TailMemPatterns(clblasSolvers[CLBLAS_GEMM_TAIL].memPatterns); clblasSolvers[CLBLAS_GEMM_TAIL].defaultPattern = -1; clblasSolvers[CLBLAS_SYR].nrPatterns = initSyrMemPatterns(clblasSolvers[CLBLAS_SYR].memPatterns); clblasSolvers[CLBLAS_SYR].defaultPattern = -1; clblasSolvers[CLBLAS_SYR2].nrPatterns = initSyr2MemPatterns(clblasSolvers[CLBLAS_SYR2].memPatterns); clblasSolvers[CLBLAS_SYR2].defaultPattern = -1; clblasSolvers[CLBLAS_GER].nrPatterns = initGerMemPatterns(clblasSolvers[CLBLAS_GER].memPatterns); clblasSolvers[CLBLAS_GER].defaultPattern = -1; clblasSolvers[CLBLAS_HER].nrPatterns = initHerMemPatterns(clblasSolvers[CLBLAS_HER].memPatterns); clblasSolvers[CLBLAS_HER].defaultPattern = -1; clblasSolvers[CLBLAS_HER2].nrPatterns = initHer2MemPatterns(clblasSolvers[CLBLAS_HER2].memPatterns); clblasSolvers[CLBLAS_HER2].defaultPattern = -1; clblasSolvers[CLBLAS_GBMV].nrPatterns = initGbmvMemPatterns(clblasSolvers[CLBLAS_GBMV].memPatterns); clblasSolvers[CLBLAS_GBMV].defaultPattern = -1; clblasSolvers[CLBLAS_SWAP].nrPatterns = initSwapMemPatterns(clblasSolvers[CLBLAS_SWAP].memPatterns); clblasSolvers[CLBLAS_SWAP].defaultPattern = -1; clblasSolvers[CLBLAS_SCAL].nrPatterns = initScalMemPatterns(clblasSolvers[CLBLAS_SCAL].memPatterns); clblasSolvers[CLBLAS_SCAL].defaultPattern = -1; clblasSolvers[CLBLAS_COPY].nrPatterns = initCopyMemPatterns(clblasSolvers[CLBLAS_COPY].memPatterns); clblasSolvers[CLBLAS_COPY].defaultPattern = -1; clblasSolvers[CLBLAS_AXPY].nrPatterns = initAxpyMemPatterns(clblasSolvers[CLBLAS_AXPY].memPatterns); clblasSolvers[CLBLAS_AXPY].defaultPattern = -1; clblasSolvers[CLBLAS_DOT].nrPatterns = initDotMemPatterns(clblasSolvers[CLBLAS_DOT].memPatterns); clblasSolvers[CLBLAS_DOT].defaultPattern = -1; clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].nrPatterns = initReductionMemPatterns(clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].memPatterns); clblasSolvers[CLBLAS_REDUCTION_EPILOGUE].defaultPattern = -1; clblasSolvers[CLBLAS_ROTG].nrPatterns = initRotgMemPatterns(clblasSolvers[CLBLAS_ROTG].memPatterns); clblasSolvers[CLBLAS_ROTG].defaultPattern = -1; clblasSolvers[CLBLAS_ROTMG].nrPatterns = initRotmgMemPatterns(clblasSolvers[CLBLAS_ROTMG].memPatterns); clblasSolvers[CLBLAS_ROTMG].defaultPattern = -1; clblasSolvers[CLBLAS_ROTM].nrPatterns = initRotmMemPatterns(clblasSolvers[CLBLAS_ROTM].memPatterns); clblasSolvers[CLBLAS_ROTM].defaultPattern = -1; clblasSolvers[CLBLAS_iAMAX].nrPatterns = initiAmaxMemPatterns(clblasSolvers[CLBLAS_iAMAX].memPatterns); clblasSolvers[CLBLAS_iAMAX].defaultPattern = -1; clblasSolvers[CLBLAS_NRM2].nrPatterns = initNrm2MemPatterns(clblasSolvers[CLBLAS_NRM2].memPatterns); clblasSolvers[CLBLAS_NRM2].defaultPattern = -1; clblasSolvers[CLBLAS_ASUM].nrPatterns = 
initAsumMemPatterns(clblasSolvers[CLBLAS_ASUM].memPatterns); clblasSolvers[CLBLAS_ASUM].defaultPattern = -1; sidsNum = makeSolverID(BLAS_FUNCTIONS_NUMBER, 0); // Read environmental variable to limit or disable ( 0 ) the size of the kernel cache in memory tmp = getenv( "AMD_CLBLAS_KCACHE_LIMIT_MB" ); if( tmp != NULL ) { kCacheLimit = atol( tmp ); #if defined( _WIN32 ) printf( "Kernel Cache limit: %Iu MB\n", kCacheLimit ); #else printf( "Kernel Cache limit: %zu MB\n", kCacheLimit ); #endif kCacheLimit *= (1024 * 1024); } if (kCacheLimit || (tmp == NULL)) { clblasKernelCache = createKernelCache(sidsNum, kCacheLimit); if (clblasKernelCache == NULL) { return clblasOutOfHostMemory; } } if (initSCImages()) { destroyKernelCache(clblasKernelCache); return clblasOutOfHostMemory; } decomposeEventsSetup(); initStorageCache(); clblasInitialized = 1; return clblasSuccess; } // TO BE FIXED: is really a uggly hack. // The tune tool and some tests are linked with // only a subset of clBLAS that does not contain // the functor related codes. // //void (* _cleanFunctorCachesHook)(void) = 0 ; void clblasTeardown(void) { if (!clblasInitialized) { return; } printMallocStatistics(); if (clblasKernelCache != NULL) { printKernelCacheSize(clblasKernelCache); destroyKernelCache(clblasKernelCache); clblasKernelCache = NULL; } releaseSCImages(); decomposeEventsTeardown(); // win32 - crashes destroyStorageCache(); cleanFunctorCaches() ; printMemLeaksInfo(); releaseMallocTrace(); #ifdef BUILDING_CLBLAS initUserGemmClKernels(); initAutoGemmClKernels(); #endif clblasInitialized = 0; } clblas-2.10/src/library/blas/ixamax.c000066400000000000000000000172751264277366700175430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ //#define IAMAX_USE_ATOMIC_MIN #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doiAmax( CLBlasKargs *kargs, size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; clblasStatus retCode = clblasSuccess; cl_event firstiAmaxCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, scratchBuf, iMax, true, X_VEC_ERRSET, A_MAT_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_iAMAX printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ))) { #ifdef DEBUG_iAMAX printf("Invalid Size for X\n"); #endif return retCode; } // Minimum size of scratchBuff is 2 * N if ((retCode = checkVectorSizes(kargs->dtype, (2 * N), scratchBuf, 0, 1, A_MAT_ERRSET ))) { #ifdef DEBUG_iAMAX printf("Insufficient ScratchBuff A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(TYPE_UNSIGNED_INT, 1, iMax, offiMax, 1, X_VEC_ERRSET ))) { #ifdef DEBUG_iAMAX printf("Invalid Size for iX\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } // cl_mem D is scratch buffer // cl_mem A is the output Buffer i.e. iMAX, offA for offiMax // cl_mem B is the input Buffer containing N Values kargs->N = N; kargs->B = X; kargs->offb = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx < 1) { // According to netlib, if incx<1, NRM2 will be zero kargs->N = 1; // Makeing it launch only 1 work-group } kargs->D = scratchBuf; kargs->A = iMax; kargs->offA = offiMax; #ifdef IAMAX_USE_ATOMIC_MIN kargs->redctnType = REDUCE_MAX_WITH_INDEX_ATOMICS; #else kargs->redctnType = REDUCE_MAX_WITH_INDEX; #endif memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); listInitHead(&seq); err = makeSolutionSeq(CLBLAS_iAMAX, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstiAmaxCall, &seq); if (err == CL_SUCCESS) { // The second kernel call needs to know the number of work-groups used // in the first kernel call. This number of work-groups is calculated here // and passed as N to second reduction kernel err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used redctnArgs.dtype = (redctnArgs.dtype == TYPE_COMPLEX_FLOAT) ? TYPE_FLOAT : ((redctnArgs.dtype == TYPE_COMPLEX_DOUBLE) ? 
TYPE_DOUBLE : redctnArgs.dtype); listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstiAmaxCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasiSamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_iAMAX printf("iSAMAX Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_iAMAX; return doiAmax(&kargs, N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasiDamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_iAMAX printf("iDAMAX called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_iAMAX; return doiAmax(&kargs, N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasiCamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_iAMAX printf("iCAMAX Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_iAMAX; kargs.dtype = TYPE_COMPLEX_FLOAT; return doiAmax(&kargs, N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasiZamax( size_t N, cl_mem iMax, size_t offiMax, const cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_iAMAX printf("iZAMAX Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_iAMAX; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doiAmax(&kargs, N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/matrix.c000066400000000000000000000523571264277366700175600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2014 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #define SWAP(TYPE,a,b) do { TYPE swap_tmp_ = a ; a = b ; b = swap_tmp_ ; } while(0) // Return true if the area starting from pint (x,y) and of size (w,h) is // within the array of size d1 x d2 static int inside2d( size_t d1, size_t d2, int x, int y, size_t w, size_t h ) { // Very very large dimensions are likely a bug size_t MAXDIM = ((size_t)INT_MAX) ; size_t max_w = (size_t)(d1-x) ; size_t max_h = (size_t)(d2-y) ; if ( d1 >= MAXDIM ) return 0 ; if ( d2 >= MAXDIM ) return 0 ; if ( w >= MAXDIM ) return 0 ; if ( h >= MAXDIM ) return 0 ; if ( x < 0 || x >= (int)d1 ) return 0 ; if ( w > max_w ) return 0 ; if ( y < 0 || y >= (int)d2 ) return 0 ; if ( h > max_h ) return 0 ; return 1 ; } clblasStatus clblasMatrixSizeInfo(clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t padding, size_t * ld, size_t * fullsize) { size_t x; size_t y; if( order == clblasRowMajor ) { x = columns; y = rows; } else { x = rows; y = columns; } // set if not NULL if( ld ) *ld = x + padding; if( fullsize ) *fullsize = (size_t) ( (x + padding) * y * elemsize ); return clblasSuccess; } cl_mem clblasCreateMatrix( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t padding, size_t * ld, size_t * fullsize, cl_int * err) { size_t tmp_fullsize; cl_mem_flags flags = CL_MEM_READ_WRITE; clblasMatrixSizeInfo( order, rows, columns, elemsize, padding, ld, &tmp_fullsize); // set if not NULL if(fullsize != NULL) *fullsize = tmp_fullsize; return clCreateBuffer( context, flags, tmp_fullsize, NULL, err); } /* * Internal function: * see clblasCreateMatrix() */ cl_mem clblasCreateMatrixWithLd( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t ld, size_t * fullsize, cl_int * err) { int nbelem; cl_mem_flags flags = CL_MEM_READ_WRITE; // compute number of elements if( order == clblasRowMajor ) { // check ld if( ld < columns ) { *err = clblasInvalidValue; return 0; } nbelem = rows * ld; } else if( order == clblasColumnMajor ) { // check ld if( ld < rows ) { *err = clblasInvalidValue; return 0; } nbelem = ld * columns; } // set if not NULL if( fullsize ) *fullsize = (size_t) (nbelem * elemsize ); // allocate return clCreateBuffer( context, flags, *fullsize, NULL, err); } cl_mem clblasCreateMatrixFromHost( cl_context context, clblasOrder order, size_t rows, size_t columns, size_t elemsize, size_t ld, void * host, size_t off_host, size_t ld_host, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_int * err) { size_t fullsize; cl_mem out; size_t i; out = clblasCreateMatrixWithLd( context, order, rows, columns, elemsize, ld, &fullsize, err); if( ! *err ) { printf("ok\n"); // TODO use ReadMatrix instead ? 
if( order == clblasRowMajor ) { for( i = 0; i < rows; i++ ) { const size_t host_orig[3] = {off_host, off_host, 0}; const size_t buff_orig[3] = {0, 0, 0}; const size_t region[3] = {columns*elemsize, rows, 1}; *err = clEnqueueWriteBufferRect( command_queue, out, CL_TRUE, buff_orig, host_orig, region, columns * elemsize, 0, ld_host * elemsize, 0, host, numEventsInWaitList, eventWaitList, NULL); } } } return out; } /* * Internal function: * enqueue event in list and wait for it if blocking */ static clblasStatus emptyAction( cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, cl_bool blocking) { cl_int err ; err = clEnqueueBarrierWithWaitList( command_queue, numEventsInWaitList, eventWaitList, event); if (err != clblasSuccess) return (clblasStatus)err; if(blocking) return (clblasStatus)clWaitForEvents(1, event); else return (clblasStatus)err; } /* * Internal function: * Generic version of clblasWriteSubMatrix with blocking arg * event must be non-NULL if blocking is set to CL_TRUE */ static clblasStatus _clblasWriteSubMatrix( clblasOrder order, size_t element_size, const void *A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, cl_bool blocking) { if( order == clblasRowMajor ) { SWAP(size_t, xA, yA); SWAP(size_t, nrA, ncA); SWAP(size_t, xB, yB); SWAP(size_t, nrB, ncB); SWAP(size_t, nx, ny); } // Check that the specified area is within the array A if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) { return clblasInvalidValue ; } // Check that the specified area is within the array B if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) { return clblasInvalidValue ; } if( nx == 0 || ny == 0 ) { return emptyAction( command_queue, numEventsInWaitList, eventWaitList, event, blocking); } { const size_t origA[3] = { (xA+offA)*element_size, yA, 0 }; const size_t origB[3] = { (xB+offB)*element_size, yB, 0 }; const size_t region[3] = { nx * element_size, ny, 1 }; return (clblasStatus) clEnqueueWriteBufferRect( command_queue, B, blocking, origB, origA, region, ldB * element_size, 0, ldA * element_size, 0, A, numEventsInWaitList, eventWaitList, event); } } clblasStatus clblasWriteSubMatrix( clblasOrder order, size_t element_size, const void *A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { cl_event evt; return _clblasWriteSubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, &evt, CL_TRUE); } clblasStatus clblasWriteSubMatrixAsync( clblasOrder order, size_t element_size, const void *A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { return _clblasWriteSubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, event, CL_FALSE); } /* * Internal function: * Generic version of clblasReadSubMatrix 
with blocking arg * event must be non-NULL if blocking is set to CL_TRUE */ static clblasStatus _clblasReadSubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, void *B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, cl_bool blocking) { if( order == clblasRowMajor ) { SWAP(size_t, xA, yA); SWAP(size_t, nrA, ncA); SWAP(size_t, xB, yB); SWAP(size_t, nrB, ncB); SWAP(size_t, nx, ny); } if( nx == 0 || ny == 0 ) { return emptyAction( command_queue, numEventsInWaitList, eventWaitList, event, blocking); } // Check that the specified area is within the array A if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) { return clblasInvalidValue ; } // Check that the specified area is within the array B if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) { return clblasInvalidValue ; } { const size_t origA[3] = { (xA+offA)*element_size, yA, 0 }; const size_t origB[3] = { (xB+offB)*element_size, yB, 0 }; const size_t region[3] = { nx * element_size, ny, 1 }; return (clblasStatus) clEnqueueReadBufferRect( command_queue, A, blocking, origA, origB, region, ldA * element_size, 0, ldB * element_size, 0, B, numEventsInWaitList, eventWaitList, event); } } clblasStatus clblasReadSubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, void *B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { cl_event evt; return _clblasReadSubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, &evt, CL_TRUE); } clblasStatus clblasReadSubMatrixAsync( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, void *B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { return _clblasReadSubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, event, CL_TRUE); } /* * Internal function: * Generic version of clblasCopySubMatrix with blocking arg * event must be non-NULL if blocking is set to CL_TRUE */ static clblasStatus _clblasCopySubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event, cl_bool blocking) { cl_int err; if( order == clblasRowMajor ) { SWAP(size_t, xA, yA); SWAP(size_t, nrA, ncA); SWAP(size_t, xB, yB); SWAP(size_t, nrB, ncB); SWAP(size_t, nx, ny); } if( nx == 0 || ny == 0 ) { return emptyAction( command_queue, numEventsInWaitList, eventWaitList, event, CL_FALSE); } // Check that the specified area is within the array A if ( !inside2d( nrA,ncA, xA,yA , nx,ny ) ) { return clblasInvalidValue ; } // Check that the specified area is within the array B if ( !inside2d( nrB,ncB, xB,yB , nx,ny ) ) { 
return clblasInvalidValue ; } { const size_t origA[3] = { (xA+offA)*element_size, yA, 0 }; const size_t origB[3] = { (xB+offB)*element_size, yB, 0 }; const size_t region[3] = { nx * element_size, ny, 1 }; err = clEnqueueCopyBufferRect( command_queue, A, B, origA, origB, region, ldA * element_size, 0, ldB * element_size, 0, numEventsInWaitList, eventWaitList, event); } if (err != clblasSuccess) return (clblasStatus)err; if(blocking) return (clblasStatus)clWaitForEvents(1, event); else return (clblasStatus)err; } clblasStatus clblasCopySubMatrix( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { cl_event evt; return (clblasStatus) _clblasCopySubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, &evt, CL_TRUE); } clblasStatus clblasCopySubMatrixAsync( clblasOrder order, size_t element_size, const cl_mem A, size_t offA, size_t ldA, size_t nrA, size_t ncA, size_t xA, size_t yA, cl_mem B, size_t offB, size_t ldB, size_t nrB, size_t ncB, size_t xB, size_t yB, size_t nx, size_t ny, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *event) { return (clblasStatus) _clblasCopySubMatrix( order, element_size, A, offA, ldA, nrA, ncA, xA, yA, B, offB, ldB, nrB, ncB, xB, yB, nx, ny, command_queue, numEventsInWaitList, eventWaitList, event, CL_FALSE); } clblasStatus clblasWriteVector( size_t nb_elem, size_t element_size, const void *A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasWriteMatrix( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasWriteVectorAsync( size_t nb_elem, size_t element_size, const void *A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasWriteMatrixAsync( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasReadVector( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, void * B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasReadMatrix( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasReadVectorAsync( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, void * B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasReadMatrixAsync( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCopyVector( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasCopyMatrix( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, 
B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasCopyVectorAsync( size_t nb_elem, size_t element_size, const cl_mem A, size_t offA, cl_mem B, size_t offB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCopyMatrixAsync( clblasColumnMajor, nb_elem, 1, element_size, A, offA, nb_elem, B, offB, nb_elem, command_queue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasWriteMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const void *A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasWriteSubMatrix( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasWriteMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const void *A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasWriteSubMatrixAsync( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasReadMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, void * B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasReadSubMatrix( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasReadMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, void * B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasReadSubMatrixAsync( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCopyMatrix( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList) { return clblasCopySubMatrix( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList); } clblasStatus clblasCopyMatrixAsync( clblasOrder order, size_t sx, size_t sy, size_t element_size, const cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_command_queue command_queue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCopySubMatrixAsync( order, element_size, A, offA, ldA, sx, sy, 0, 0, B, offB, ldB, sx, sy, 0, 0, sx, sy, command_queue, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/scimage.c000066400000000000000000000163721264277366700176610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #define IMAGES_LOCK() mutexLock(imagesLock) #define IMAGES_UNLOCK() mutexUnlock(imagesLock) typedef struct DeviceNode { cl_device_id devID; ListNode node; } DeviceNode; typedef struct SCImageNode { cl_mem image; size_t width; size_t height; // devices using this image for computing ListHead usingDevices; ListNode node; } SCImageNode; typedef struct SearchContext { cl_context ctx; cl_device_id devID; cl_ulong bestSize; cl_ulong minSize; size_t minWidth; cl_ulong minExtraSize; SCImageNode *bestImgNode; } SearchContext; static const cl_image_format IMAGE_FORMAT = { CL_RGBA, CL_UNSIGNED_INT32 }; static ListHead images; static mutex_t *imagesLock = NULL; static void freeDeviceNode(ListNode *node) { DeviceNode *devNode; devNode = container_of(node, node, DeviceNode); listDel(node); free(devNode); } static void freeImageNode(ListNode *node) { SCImageNode *imgNode; imgNode = container_of(node, node, SCImageNode); clReleaseMemObject(imgNode->image); listDoForEachSafe(&imgNode->usingDevices, freeDeviceNode); free(imgNode); } static int imageNodeCmp(const ListNode *node, const void *key) { SCImageNode *imgNode; const cl_mem *image; imgNode = container_of(node, node, SCImageNode); image = (const cl_mem *)key; return (imgNode->image == *image) ? 0 : 1; } static int deviceNodeCmp(const ListNode *node, const void *key) { cl_device_id *devID = (cl_device_id*)key; DeviceNode *devNode = container_of(node, node, DeviceNode); return !(devNode->devID == *devID); } static void checkBestImage(ListNode *node, void *priv) { SCImageNode *imgNode; ListNode *dnode; SearchContext *sctx = (SearchContext*)priv; cl_ulong es, is; // extra and image size imgNode = container_of(node, node, SCImageNode); is = imgNode->height * imgNode->width; // check if the image is not yet in use and meet the size requirements dnode = listNodeSearch(&imgNode->usingDevices, (const void*)&sctx->devID, deviceNodeCmp); if ((dnode == NULL) && (imgNode->width >= sctx->minWidth) && (is >= sctx->minSize)) { es = (is >= sctx->bestSize) ? 
(is - sctx->bestSize) : (sctx->bestSize - is); if (es < sctx->minExtraSize) { sctx->minExtraSize = es; sctx->bestImgNode = imgNode; } } } int VISIBILITY_HIDDEN initSCImages(void) { int ret = 0; listInitHead(&images); imagesLock = mutexInit(); if (imagesLock == NULL) { ret = -1; } return ret; } void VISIBILITY_HIDDEN releaseSCImages(void) { IMAGES_LOCK(); listDoForEachSafe(&images, freeImageNode); listInitHead(&images); IMAGES_UNLOCK(); mutexDestroy(imagesLock); } cl_mem VISIBILITY_HIDDEN getSCImage( cl_context ctx, cl_device_id devID, cl_ulong bestSize, cl_ulong minSize, size_t minWidth) { cl_mem img = NULL; DeviceNode *devNode; SearchContext sctx; sctx.ctx = ctx; sctx.devID = devID; sctx.bestSize = bestSize; sctx.minSize = minSize; sctx.minWidth = minWidth; sctx.minExtraSize = (cl_ulong)1 << 63; sctx.bestImgNode = NULL; devNode = malloc(sizeof(DeviceNode)); if (devNode == NULL) { return NULL; } /* * find an image serving turn to minimum of either * unused image space or unfitted data size */ IMAGES_LOCK(); listDoForEachPriv(&images, checkBestImage, &sctx); if (sctx.bestImgNode != NULL) { img = sctx.bestImgNode->image; devNode->devID = devID; listAddToTail(&sctx.bestImgNode->usingDevices, &devNode->node); clRetainMemObject(img); } IMAGES_UNLOCK(); if (img == NULL) { free(devNode); } return img; } void VISIBILITY_HIDDEN putSCImage(cl_device_id devID, cl_mem image) { ListNode *node; SCImageNode *imgNode; DeviceNode *devNode = NULL; IMAGES_LOCK(); node = listNodeSearch(&images, (const void*)&image, imageNodeCmp); if (node != NULL) { imgNode = container_of(node, node, SCImageNode); node = listNodeSearch(&imgNode->usingDevices, (const void*)&devID, deviceNodeCmp); if (node != NULL) { devNode = container_of(node, node, DeviceNode); listDel(node); } } IMAGES_UNLOCK(); if (devNode != NULL) { free(devNode); } clReleaseMemObject(image); } cl_ulong clblasAddScratchImage( cl_context context, size_t width, size_t height, clblasStatus *status) { cl_int err; cl_mem image; SCImageNode *imgNode; intptr_t tmp; if (!clblasInitialized) { if (status != NULL) { *status = clblasNotInitialized; } return 0; } if (!scratchImagesEnabled()) { if (status != NULL) { *status = clblasSuccess; } return 0; } image = clCreateImage2D(context, CL_MEM_READ_WRITE, &IMAGE_FORMAT, width, height, 0, NULL, &err); if (err != CL_SUCCESS) { if (status != NULL) { *status = (clblasStatus)err; } return 0; } imgNode = calloc(1, sizeof(SCImageNode)); if (imgNode == NULL) { clReleaseMemObject(image); if (status != NULL) { *status = clblasOutOfHostMemory; } return 0; } imgNode->image = image; imgNode->width = width; imgNode->height = height; listInitHead(&imgNode->usingDevices); mutexLock(imagesLock); if ((images.prev == NULL) && (images.next == NULL)) { listInitHead(&images); } listAddToHead(&images, &(imgNode->node)); mutexUnlock(imagesLock); if (status != NULL) { *status = clblasSuccess; } tmp = (intptr_t)image; return (cl_ulong)tmp; } clblasStatus clblasRemoveScratchImage( cl_ulong imageID) { intptr_t tmp = (intptr_t)imageID; cl_mem image = (cl_mem)tmp; ListNode *node; if (!clblasInitialized) { return clblasNotInitialized; } if (!scratchImagesEnabled()) { return clblasSuccess; } IMAGES_LOCK(); node = listNodeSearch(&images, &image, imageNodeCmp); if (node == NULL) { IMAGES_UNLOCK(); return clblasInvalidValue; } listDel(node); IMAGES_UNLOCK(); freeImageNode(node); return clblasSuccess; } 
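/*
 * Illustrative usage sketch (hypothetical, not part of the library source): a minimal
 * host-side pattern for the scratch-image API implemented above. It assumes clblasSetup()
 * has already succeeded, `ctx` is a valid cl_context, and scratch images are enabled;
 * the 2048x256 dimensions are placeholder values chosen only for illustration.
 *
 *   clblasStatus st;
 *   cl_ulong img = clblasAddScratchImage(ctx, 2048, 256, &st);
 *   if (st == clblasSuccess && img != 0) {
 *       // ... enqueue clBLAS calls that may pick up the image internally ...
 *       clblasRemoveScratchImage(img);
 *   }
 */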
clblas-2.10/src/library/blas/specialCases/000077500000000000000000000000001264277366700204735ustar00rootroot00000000000000clblas-2.10/src/library/blas/specialCases/GemmSpecialCases.cpp000066400000000000000000000761641264277366700243620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2015 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "GemmSpecialCases.h" #include "UserGemmKernelSources/UserGemmKernelSourceIncludes.h" #include "UserGemmKernelSources/UserGemmClKernels.h" #include "xgemm.h" //helper functions defined in xgemm.cpp #include "AutoGemmIncludes/AutoGemmClKernels.h" #include "AutoGemmIncludes/AutoGemmKernelSources.h" #include "AutoGemmIncludes/AutoGemmKernelBinaries.h" /****************************************************************************** * Check OpenCL Errors *****************************************************************************/ #define CL_CHECK(RET) \ if(RET != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", RET, __LINE__); \ assert(false); \ } /* template clblasStatus SGEMM_SPLIT_CALLS( cl_kernel *ClKernel, clblasOrder order, unsigned int tile_size, unsigned int WG_size, unsigned int M_split_factor, unsigned int N_split_factor, unsigned int K_split_factor, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, precision alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, precision beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); */ template clblasStatus GEMM_SPLIT_CALLS( cl_kernel *ClKernel, clblasOrder order, unsigned int tile_size, unsigned int WG_size, unsigned int M_split_factor, unsigned int N_split_factor, unsigned int K_split_factor, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, precision alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, precision beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { //for example, when M=N=K=8192 in GEMM col NT //we are gonna call 16 GEMMs //each GEMM has M=N=K=4096 //note are direct GEMM call has a 0.7 TFLOPS performance // [ A11 | A12 | A13 | A14 ] [ B11 | B12 | B13 | B14 ] [ C11 | C12 ] // A = [ A21 | A22 | A23 | A24 ] B = [ B21 | B22 | B23 | B24 ] C = [ C21 | C22 ] // 16 GEMMs are // #01: C11 = a*A11*B11 + b*C11 // #02: C11 = a*A12*B12 + 1*C11 // #03: C11 = a*A13*B13 + 1*C11 // #04: C11 = a*A14*B14 + 1*C11 now we are done with C11 // #05: C12 = a*A11*B21 + b*C12 // #06: C12 = a*A12*B22 + 1*C12 // #07: C12 = a*A12*B22 + 1*C12 // #08: C12 = a*A12*B22 + 1*C12 now we are done with C12 // #09: C21 = a*A21*B11 + b*C21 // #10: C21 = a*A22*B12 + 1*C21 // #11: C21 = a*A23*B13 + 
1*C21 // #12: C21 = a*A24*B14 + 1*C21 now we are done with C21 // #13: C22 = a*A21*B21 + b*C22 // #14: C22 = a*A22*B22 + 1*C22 // #15: C22 = a*A23*B23 + 1*C22 // #16: C22 = a*A24*B24 + 1*C22 now we are done with C22 if (transA == clblasNoTrans && transB == clblasTrans) { unsigned int small_M = M / M_split_factor; unsigned int small_N = N / N_split_factor; unsigned int small_K = K / K_split_factor; size_t GlobalX = ((small_M - 1) / tile_size + 1) * WG_size; size_t GlobalY = ((small_N - 1) / tile_size + 1) * WG_size; size_t gs[2] = { GlobalX, GlobalY }; size_t wgsize[2] = { WG_size, WG_size }; cl_int error = 0; precision betaone = 1; error = clSetKernelArg(*ClKernel, 5, sizeof(cl_uint), &small_M); assert(error == CL_SUCCESS); error = clSetKernelArg(*ClKernel, 6, sizeof(cl_uint), &small_N); assert(error == CL_SUCCESS); error = clSetKernelArg(*ClKernel, 7, sizeof(cl_uint), &small_K); assert(error == CL_SUCCESS); for (int M_split_index = 0; M_split_index < M_split_factor; M_split_index++) { for (int N_split_index = 0; N_split_index < N_split_factor; N_split_index++) { unsigned int offc_C = ldc*N / N_split_factor * N_split_index + M / M_split_factor * M_split_index + offC; error = clSetKernelArg(*ClKernel, 13, sizeof(cl_uint), &offc_C); assert(error == CL_SUCCESS); for (int K_split_index = 0; K_split_index < K_split_factor; K_split_index++) { unsigned int offa_A = (M / M_split_factor * M_split_index) + (lda * K / K_split_factor * K_split_index) + offA; unsigned int offb_B = (N / N_split_factor * N_split_index) + (ldb * K / K_split_factor * K_split_index) + offB; error = clSetKernelArg(*ClKernel, 11, sizeof(cl_uint), &offa_A); assert(error == CL_SUCCESS); error = clSetKernelArg(*ClKernel, 12, sizeof(cl_uint), &offb_B); assert(error == CL_SUCCESS); if (K_split_index == 0) { error = clSetKernelArg(*ClKernel, 4, sizeof(precision), &(beta)); assert(error == CL_SUCCESS); if (M_split_index == 0 && N_split_index == 0) { //very first GEMM call if ((M_split_factor == 1) && (N_split_factor == 1) && (K_split_factor == 1)) { //also very last GEMM call error = clEnqueueNDRangeKernel(commandQueues[0], *ClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); assert(error == CL_SUCCESS); } else { error = clEnqueueNDRangeKernel(commandQueues[0], *ClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, NULL); assert(error == CL_SUCCESS); } } else { error = clEnqueueNDRangeKernel(commandQueues[0], *ClKernel, 2, NULL, gs, wgsize, 0, NULL, NULL); assert(error == CL_SUCCESS); } } else { error = clSetKernelArg(*ClKernel, 4, sizeof(precision), &betaone); assert(error == CL_SUCCESS); if ((M_split_index == (M_split_factor - 1)) && (N_split_index == (N_split_factor - 1)) && (K_split_index == (K_split_factor - 1))) { //very last GEMM call error = clEnqueueNDRangeKernel(commandQueues[0], *ClKernel, 2, NULL, gs, wgsize, 0, NULL, events); assert(error == CL_SUCCESS); } else { error = clEnqueueNDRangeKernel(commandQueues[0], *ClKernel, 2, NULL, gs, wgsize, 0, NULL, NULL); assert(error == CL_SUCCESS); } } } } } return clblasSuccess; } return clblasNotImplemented; } clblasStatus SGEMM_mod1024( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char 
*tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; //split the kernel calls to handle sgemm NT perf drop at big multiples of 1024 if ((lda % 1024 == 0) && (ldb % 1024 == 0) && (K > lda / 4)) { if ((lda == ldb) && (lda >= 4096) && (lda <= 8192)) // between 4096 and 8192 for now { if (lda != 6144)// 6144 is handled by 96 x 96 kernel { // we are going to call 16 GEMMs with M=M/2, N=N/2, K=K/4 // each GEMM requires M%128 == 0, N%128 == 0, K%16 == 0 if (M % 256 == 0 && N % 256 == 0 && K % 64 == 0) { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; specialCaseHandled = true; unsigned int M_split_factor; unsigned int N_split_factor; unsigned int K_split_factor; if (lda < 7168) { M_split_factor = 1; N_split_factor = 1; K_split_factor = 1; } else { //7168, 8192 M_split_factor = 2; N_split_factor = 2; K_split_factor = 4; } tileKernelSource = sgemm_Col_NT_B1_MX128_NX128_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX128_NX128_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX128_NX128_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX128_NX128_KX16_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 128, 16, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return status; } } else { // lda == ldb == 6144 // we are going to call 4 GEMMs each with K = K/4 if (M % 96 == 0 && N % 96 == 0 && K % 64 == 0) { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; specialCaseHandled = true; unsigned int M_split_factor = 1; unsigned int N_split_factor = 1; unsigned int K_split_factor = 4; tileKernelSource = sgemm_Col_NT_B1_MX096_NX096_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX096_NX096_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX096_NX096_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX096_NX096_KX16_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); 
CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 96, 16, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return status; } } } } return clblasNotImplemented; } clblasStatus SGEMM_SPLIT64_32( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { //all the mod32 sizes that is not mod64 or mod96 ranging from 1184 to 3872 //non mod32 cases are not implemented in this approach and are of less interest const char *tileKernelSource = NULL; const char *rowKernelSource = NULL; const char *columnKernelSource = NULL; const char *singleKernelSource = NULL; cl_kernel *tileClKernel = NULL; cl_kernel *rowClKernel = NULL; cl_kernel *columnClKernel = NULL; cl_kernel *singleClKernel = NULL; const unsigned char *tileKernelBinary = NULL; const unsigned char *rowKernelBinary = NULL; const unsigned char *columnKernelBinary = NULL; const unsigned char *singleKernelBinary = NULL; size_t tileKernelBinarySize = 0; size_t rowKernelBinarySize = 0; size_t columnKernelBinarySize = 0; size_t singleKernelBinarySize = 0; cl_int err; if ((M >= 1184 && N >= 1184) && (M <= 3872 && N <= 3872) && (M % 64 != 0 && N % 64 != 0) && (M % 96 != 0 && N % 96 != 0) && (K % 16 == 0)) { if ((M % 32 == 0 && N % 32 == 0) && (transA == clblasNoTrans && transB == clblasTrans)) { specialCaseHandled = true; //execute the kernels //GlobalX = ((Mvalue - 1) / 64) * 16 //GlobalY = ((Nvalue - 1) / 64) * 16 size_t GlobalX = ((M - 1) / 64) * 16; size_t GlobalY = ((N - 1) / 64) * 16; size_t gs[2] = { GlobalX, GlobalY }; size_t wgsize[2] = { 16, 16 }; tileKernelSource = sgemm_Col_NT_B1_MX064_NX064_KX16_src; tileClKernel = &sgemm_Col_NT_B1_MX064_NX064_KX16_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX064_NX064_KX16_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX064_NX064_KX16_binSize; rowKernelSource = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_src; rowClKernel = &sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_clKernel; rowKernelBinary = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_bin; rowKernelBinarySize = sgemm_Col_NT_B1_MX032_NX064_KX16_ROW_binSize; columnKernelSource = 
sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_src; columnClKernel = &sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_clKernel; columnKernelBinary = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_bin; columnKernelBinarySize = sgemm_Col_NT_B1_MX064_NX032_KX16_COLUMN_binSize; singleKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_src; singleClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_clKernel; singleKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_bin; singleKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_SINGLE_binSize; cl_kernel * Kernels[4] = { tileClKernel, rowClKernel, columnClKernel, singleClKernel }; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); makeGemmKernel(rowClKernel, commandQueues[0], rowKernelSource, User_srcBuildOptions, &rowKernelBinary, &rowKernelBinarySize, User_binBuildOptions); makeGemmKernel(columnClKernel, commandQueues[0], columnKernelSource, User_srcBuildOptions, &columnKernelBinary, &columnKernelBinarySize, User_binBuildOptions); makeGemmKernel(singleClKernel, commandQueues[0], singleKernelSource, User_srcBuildOptions, &singleKernelBinary, &singleKernelBinarySize, User_binBuildOptions); for (int i = 0; i < 4; i++) { err = clSetKernelArg(*Kernels[i], 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*Kernels[i], 13, sizeof(cl_uint), &offC); CL_CHECK(err); } err = clEnqueueNDRangeKernel(commandQueues[0], *Kernels[0], 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, NULL); gs[0] = 16; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[1], 2, NULL, gs, wgsize, 0, NULL, NULL); gs[1] = 16; gs[0] = GlobalX; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[2], 2, NULL, gs, wgsize, 0, NULL, NULL); gs[0] = 16; gs[1] = 16; err |= clEnqueueNDRangeKernel(commandQueues[0], *Kernels[3], 2, NULL, gs, wgsize, 0, NULL, events); if (err == 0) return clblasSuccess; } } return clblasNotImplemented; } clblasStatus SGEMM_BRANCH_32( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char *tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; if ((M * N < 1080 * 1080) && (M % 32 != 0 || N % 32 != 0) && (K%16==0)) { // 
((Mvalue - 1) / 32 + 1) * 16 size_t GlobalX = ((M - 1) / 32 + 1) * 16; size_t GlobalY = ((N - 1) / 32 + 1) * 16; size_t gs[2] = { GlobalX, GlobalY }; size_t wgsize[2] = { 16, 16 }; if (transA == clblasNoTrans && transB == clblasNoTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_NN_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } if (transA == clblasNoTrans && transB == clblasTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_NT_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = 
clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } if (transA == clblasTrans && transB == clblasNoTrans) { specialCaseHandled = true; tileKernelSource = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_src; tileClKernel = &sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_clKernel; tileKernelBinary = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_bin; tileKernelBinarySize = sgemm_Col_TN_B1_MX032_NX032_KX16_BRANCH_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, &tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_float), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_float), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); err = clEnqueueNDRangeKernel(commandQueues[0], *tileClKernel, 2, NULL, gs, wgsize, numEventsInWaitList, eventWaitList, &events[0]); if (err == 0) return clblasSuccess; } } return clblasNotImplemented; } clblasStatus DGEMM_BIG_MOD48( clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, double alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, double beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { const char *tileKernelSource = NULL; cl_kernel *tileClKernel = NULL; size_t tileKernelBinarySize = 0; cl_int err; const unsigned char *tileKernelBinary = NULL; clblasStatus status; //split the kernel calls to handle dgemm NT perf drop when matrix sizes are big if ((lda == ldb) && (lda >= 18000) && (lda <= 36000)) // between 18000 and 36000 for now { if (!((transA == clblasNoTrans) && (transB == clblasTrans))) return clblasNotImplemented; unsigned int M_split_factor; unsigned int N_split_factor; unsigned int K_split_factor; if ((M % 192 == 0) && (N % 192 == 0) && (K % 192 == 0) && (K > lda / 4)) { M_split_factor = 4; N_split_factor = 4; K_split_factor = 4; } else if ((M % 96 == 0) && (N % 96 == 0) && (K % 96 == 0) && (K > lda / 4)) { M_split_factor = 2; N_split_factor = 2; K_split_factor = 2; } else { return clblasNotImplemented; } tileKernelSource = dgemm_Col_NT_B1_MX048_NX048_KX08_src; tileClKernel = &dgemm_Col_NT_B1_MX048_NX048_KX08_clKernel; tileKernelBinary = dgemm_Col_NT_B1_MX048_NX048_KX08_bin; tileKernelBinarySize = dgemm_Col_NT_B1_MX048_NX048_KX08_binSize; makeGemmKernel(tileClKernel, commandQueues[0], tileKernelSource, User_srcBuildOptions, 
&tileKernelBinary, &tileKernelBinarySize, User_binBuildOptions); err = clSetKernelArg(*tileClKernel, 0, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 1, sizeof(cl_mem), &B); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 2, sizeof(cl_mem), &C); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 3, sizeof(cl_double), &alpha); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 4, sizeof(cl_double), &beta); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 5, sizeof(cl_uint), &M); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 6, sizeof(cl_uint), &N); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 7, sizeof(cl_uint), &K); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 8, sizeof(cl_uint), &lda); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 9, sizeof(cl_uint), &ldb); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 10, sizeof(cl_uint), &ldc); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 11, sizeof(cl_uint), &offA); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 12, sizeof(cl_uint), &offB); CL_CHECK(err); err = clSetKernelArg(*tileClKernel, 13, sizeof(cl_uint), &offC); CL_CHECK(err); status = GEMM_SPLIT_CALLS( tileClKernel, clblasColumnMajor, 48, 8, M_split_factor, N_split_factor, K_split_factor, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); if (status == clblasSuccess) specialCaseHandled = true; return status; } return clblasNotImplemented; } template<> clblasStatus GemmSpecialCases(clblasOrder order, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, float alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, float beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { if (order == clblasRowMajor) return clblasNotImplemented; clblasStatus status; //handles big multiples of 1024 status = SGEMM_mod1024(transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return status; //handles mod32 but not mod64 status = SGEMM_SPLIT64_32(transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return status; //handles middle range sgemm (M*N<1080*1080) that are not mod32 (M%32!=0 || N%32!=0) //use 32x32 micro tile kernels with branch statement within kernels status = SGEMM_BRANCH_32(transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return status; return clblasNotImplemented; } template<> clblasStatus GemmSpecialCases(clblasOrder order, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, double alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, double beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { if (order == clblasRowMajor) return clblasNotImplemented; clblasStatus status; status 
= DGEMM_BIG_MOD48(transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return status; return clblasNotImplemented; } template<> clblasStatus GemmSpecialCases(clblasOrder order, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, FloatComplex alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, FloatComplex beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { return clblasNotImplemented; } template<> clblasStatus GemmSpecialCases(clblasOrder order, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, DoubleComplex alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, DoubleComplex beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { return clblasNotImplemented; }clblas-2.10/src/library/blas/specialCases/include/000077500000000000000000000000001264277366700221165ustar00rootroot00000000000000clblas-2.10/src/library/blas/specialCases/include/GemmSpecialCases.h000066400000000000000000000026561264277366700254450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2015 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef CLBLAS_GEMM_SPECIAL_CASES_H #define CLBLAS_GEMM_SPECIAL_CASES_H #include #include #include template clblasStatus GemmSpecialCases(clblasOrder order, clblasTranspose transA, clblasTranspose transB, cl_uint M, cl_uint N, cl_uint K, Precision alpha, cl_mem A, cl_uint offA, cl_uint lda, cl_mem B, cl_uint offB, cl_uint ldb, Precision beta, cl_mem C, cl_uint offC, cl_uint ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled); #endifclblas-2.10/src/library/blas/trtri/000077500000000000000000000000001264277366700172405ustar00rootroot00000000000000clblas-2.10/src/library/blas/trtri/TrtriClKernels.h000066400000000000000000000040331264277366700223200ustar00rootroot00000000000000 #ifndef TRTRI_CL_KERNELS_H #define TRTRI_CL_KERNELS_H #if defined( __APPLE__ ) || defined ( __MACOS ) #include #else #include "CL/cl.h" #endif /*mod 192 dtrsm*/ static cl_kernel diag_dtrtri_upper_192_12_clKernel = NULL; static cl_kernel triple_dgemm_update_192_12_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_24_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_24_PART2_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_48_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_48_PART2_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_96_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_192_96_PART2_R_clKernel = NULL; /*mod 128 dtrsm*/ /*upper*/ static cl_kernel diag_dtrtri_upper_128_16_clKernel = NULL; static cl_kernel triple_dgemm_update_128_16_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_32_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_32_PART2_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_64_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_64_PART2_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART1_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART2_R_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART3_R_clKernel = NULL; /*lower*/ static cl_kernel diag_dtrtri_lower_128_16_clKernel = NULL; static cl_kernel triple_dgemm_update_128_16_PART1_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_16_PART2_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_32_PART1_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_32_PART2_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_64_PART1_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_64_PART2_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART1_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART2_L_clKernel = NULL; static cl_kernel triple_dgemm_update_128_ABOVE64_PART3_L_clKernel = NULL; #endifclblas-2.10/src/library/blas/trtri/TrtriKernelSourceIncludes.cpp000066400000000000000000000063421264277366700250660ustar00rootroot00000000000000/******************************************************************************* * This file is NOT auto-generated; populate it with hand-written kernels ******************************************************************************/ #ifndef TRTRI_SOURCE_INCLUDES_CPP #define TRTRI_SOURCE_INCLUDES_CPP #ifndef CLBLAS_OFFLINE_COMPILE_DTRSM /*mod 192 dtrsm*/ #include "diag_dtrtri_upper_192_12.cpp" #include "triple_dgemm_update_192_12_R.cpp" #include 
"triple_dgemm_update_192_24_PART1_R.cpp" #include "triple_dgemm_update_192_24_PART2_R.cpp" #include "triple_dgemm_update_192_48_PART1_R.cpp" #include "triple_dgemm_update_192_48_PART2_R.cpp" #include "triple_dgemm_update_192_96_PART1_R.cpp" #include "triple_dgemm_update_192_96_PART2_R.cpp" /*mod 128 dtrsm*/ /*upper*/ #include "diag_dtrtri_upper_128_16.cpp" #include "triple_dgemm_update_128_16_R.cpp" #include "triple_dgemm_update_128_32_PART1_R.cpp" #include "triple_dgemm_update_128_32_PART2_R.cpp" #include "triple_dgemm_update_128_64_PART1_R.cpp" #include "triple_dgemm_update_128_64_PART2_R.cpp" #include "triple_dgemm_update_128_ABOVE64_PART1_R.cpp" #include "triple_dgemm_update_128_ABOVE64_PART2_R.cpp" #include "triple_dgemm_update_128_ABOVE64_PART3_R.cpp" /*lower*/ #include "diag_dtrtri_lower_128_16.cpp" #include "triple_dgemm_update_128_16_PART1_L.cpp" #include "triple_dgemm_update_128_16_PART2_L.cpp" #include "triple_dgemm_update_128_32_PART1_L.cpp" #include "triple_dgemm_update_128_32_PART2_L.cpp" #include "triple_dgemm_update_128_64_PART1_L.cpp" #include "triple_dgemm_update_128_64_PART2_L.cpp" #include "triple_dgemm_update_128_ABOVE64_PART1_L.cpp" #include "triple_dgemm_update_128_ABOVE64_PART2_L.cpp" #include "triple_dgemm_update_128_ABOVE64_PART3_L.cpp" #else /*mod 192 dtrsm*/ #include "diag_dtrtri_upper_192_12_bin.cpp" #include "triple_dgemm_update_192_12_R_bin.cpp" #include "triple_dgemm_update_192_24_PART1_R_bin.cpp" #include "triple_dgemm_update_192_24_PART2_R_bin.cpp" #include "triple_dgemm_update_192_48_PART1_R_bin.cpp" #include "triple_dgemm_update_192_48_PART2_R_bin.cpp" #include "triple_dgemm_update_192_96_PART1_R_bin.cpp" #include "triple_dgemm_update_192_96_PART2_R_bin.cpp" /*mod 128 dtrsm*/ /*upper*/ #include "diag_dtrtri_upper_128_16_bin.cpp" #include "triple_dgemm_update_128_16_R_bin.cpp" #include "triple_dgemm_update_128_32_PART1_R_bin.cpp" #include "triple_dgemm_update_128_32_PART2_R_bin.cpp" #include "triple_dgemm_update_128_64_PART1_R_bin.cpp" #include "triple_dgemm_update_128_64_PART2_R_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART1_R_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART2_R_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART3_R_bin.cpp" /*lower*/ #include "diag_dtrtri_lower_128_16_bin.cpp" #include "triple_dgemm_update_128_16_PART1_L_bin.cpp" #include "triple_dgemm_update_128_16_PART2_L_bin.cpp" #include "triple_dgemm_update_128_32_PART1_L_bin.cpp" #include "triple_dgemm_update_128_32_PART2_L_bin.cpp" #include "triple_dgemm_update_128_64_PART1_L_bin.cpp" #include "triple_dgemm_update_128_64_PART2_L_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART1_L_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART2_L_bin.cpp" #include "triple_dgemm_update_128_ABOVE64_PART3_L_bin.cpp" #endif //CLBLAS_OFFLINE_COMPILE_DTRSM #endif clblas-2.10/src/library/blas/trtri/TrtriKernelSourceIncludes.h000066400000000000000000000124201264277366700245250ustar00rootroot00000000000000 #ifndef TRTRI_SOURCE_INCLUDES_H #define TRTRI_SOURCE_INCLUDES_H #include #include "TrtriKernelSourceIncludes.cpp" //**** compiler flags //**** online compilation flags const char * const TrtriBuildOptions = "-cl-std=CL2.0"; const char * const TrtribinBuildOptions = "-cl-std=CL2.0"; /*mod 192 dtrsm*/ extern const char * const diag_dtrtri_upper_192_12_src; extern unsigned char *diag_dtrtri_upper_192_12_bin; extern size_t diag_dtrtri_upper_192_12_binSize; extern const char * const triple_dgemm_update_192_12_R_src; extern unsigned char *triple_dgemm_update_192_12_R_bin; 
extern size_t triple_dgemm_update_192_12_R_binSize; extern const char * const triple_dgemm_update_192_24_PART1_R_src; extern unsigned char *triple_dgemm_update_192_24_PART1_R_bin; extern size_t triple_dgemm_update_192_24_PART1_R_binSize; extern const char * const triple_dgemm_update_192_24_PART2_R_src; extern unsigned char *triple_dgemm_update_192_24_PART2_R_bin; extern size_t triple_dgemm_update_192_24_PART2_R_binSize; extern const char * const triple_dgemm_update_192_48_PART1_R_src; extern unsigned char *triple_dgemm_update_192_48_PART1_R_bin; extern size_t triple_dgemm_update_192_48_PART1_R_binSize; extern const char * const triple_dgemm_update_192_48_PART2_R_src; extern unsigned char *triple_dgemm_update_192_48_PART2_R_bin; extern size_t triple_dgemm_update_192_48_PART2_R_binSize; extern const char * const triple_dgemm_update_192_96_PART1_R_src; extern unsigned char *triple_dgemm_update_192_96_PART1_R_bin; extern size_t triple_dgemm_update_192_96_PART1_R_binSize; extern const char * const triple_dgemm_update_192_96_PART2_R_src; extern unsigned char *triple_dgemm_update_192_96_PART2_R_bin; extern size_t triple_dgemm_update_192_96_PART2_R_binSize; /*mod 128 dtrsm*/ /*upper*/ extern const char * const diag_dtrtri_upper_128_16_src; extern unsigned char *diag_dtrtri_upper_128_16_bin; extern size_t diag_dtrtri_upper_128_16_binSize; extern const char * const triple_dgemm_update_128_16_R_src; extern unsigned char *triple_dgemm_update_128_16_R_bin; extern size_t triple_dgemm_update_128_16_R_binSize; extern const char * const triple_dgemm_update_128_32_PART1_R_src; extern unsigned char *triple_dgemm_update_128_32_PART1_R_bin; extern size_t triple_dgemm_update_128_32_PART1_R_binSize; extern const char * const triple_dgemm_update_128_32_PART2_R_src; extern unsigned char *triple_dgemm_update_128_32_PART2_R_bin; extern size_t triple_dgemm_update_128_32_PART2_R_binSize; extern const char * const triple_dgemm_update_128_64_PART1_R_src; extern unsigned char *triple_dgemm_update_128_64_PART1_R_bin; extern size_t triple_dgemm_update_128_64_PART1_R_binSize; extern const char * const triple_dgemm_update_128_64_PART2_R_src; extern unsigned char *triple_dgemm_update_128_64_PART2_R_bin; extern size_t triple_dgemm_update_128_64_PART2_R_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART1_R_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART1_R_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART1_R_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART2_R_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART2_R_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART2_R_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART3_R_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART3_R_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART3_R_binSize; /*lower*/ extern const char * const diag_dtrtri_lower_128_16_src; extern unsigned char *diag_dtrtri_lower_128_16_bin; extern size_t diag_dtrtri_lower_128_16_binSize; extern const char * const triple_dgemm_update_128_16_PART1_L_src; extern unsigned char *triple_dgemm_update_128_16_PART1_L_bin; extern size_t triple_dgemm_update_128_16_PART1_L_binSize; extern const char * const triple_dgemm_update_128_16_PART2_L_src; extern unsigned char *triple_dgemm_update_128_16_PART2_L_bin; extern size_t triple_dgemm_update_128_16_PART2_L_binSize; extern const char * const triple_dgemm_update_128_32_PART1_L_src; extern unsigned char *triple_dgemm_update_128_32_PART1_L_bin; extern size_t 
triple_dgemm_update_128_32_PART1_L_binSize; extern const char * const triple_dgemm_update_128_32_PART2_L_src; extern unsigned char *triple_dgemm_update_128_32_PART2_L_bin; extern size_t triple_dgemm_update_128_32_PART2_L_binSize; extern const char * const triple_dgemm_update_128_64_PART1_L_src; extern unsigned char *triple_dgemm_update_128_64_PART1_L_bin; extern size_t triple_dgemm_update_128_64_PART1_L_binSize; extern const char * const triple_dgemm_update_128_64_PART2_L_src; extern unsigned char *triple_dgemm_update_128_64_PART2_L_bin; extern size_t triple_dgemm_update_128_64_PART2_L_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART1_L_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART1_L_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART1_L_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART2_L_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART2_L_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART2_L_binSize; extern const char * const triple_dgemm_update_128_ABOVE64_PART3_L_src; extern unsigned char *triple_dgemm_update_128_ABOVE64_PART3_L_bin; extern size_t triple_dgemm_update_128_ABOVE64_PART3_L_binSize; #endif clblas-2.10/src/library/blas/trtri/diag_dtrtri_lower_128_16.cpp000066400000000000000000000070421264277366700243530ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP #pragma message("#define KERNEL_DIAG_DTRTRI_LOWER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
tx] == ZERO)\n { \n Bs[tx*BLOCK_SIZE + tx] = ONE; \n }\n else\n { \n Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n }\n }\n /* * the lower case */ if (!(tx < BLOCK_SIZE - 1))\n { \n switcher = ONE; \n }\n else\n { \n switcher = ZERO; \n }\n Bs[(BLOCK_SIZE - 1)*BLOCK_SIZE + tx] = switcher * Bs[(BLOCK_SIZE - 1)*BLOCK_SIZE + tx]; \n // zero out the last column, except the diagonal element for (i = BLOCK_SIZE - 2; i >= 0; i--) {\n Ystx = ZERO; \n if (tx > i)\n { \n switcher = ONE; \n }\n else\n { \n switcher = ZERO; \n }\n //dtrmv Bw = Bs + (i + 1)*BLOCK_SIZE + i + 1; \n workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n x = workspace + i + 1; \n y = Bs + i*BLOCK_SIZE; \n txw = (tx - i - 1); \n _Pragma("unroll")\n for (j = 0; j < BLOCK_SIZE - i - 1; j++)\n Ystx += switcher*(*(Bw + j*BLOCK_SIZE + txw)*x[j]); \n //sscal if (tx != i)\n { \n switcher = ONE; \n neg_switcher = ZERO; \n }\n else\n { \n switcher = ZERO; \n neg_switcher = ONE; \n }\n y[tx] = switcher * Ystx*(-Bs[i*BLOCK_SIZE + i]) + neg_switcher *y[tx]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // write back A _Pragma("unroll")\n for (i = 0; i < BLOCK_SIZE; i++)\n *(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/diag_dtrtri_upper_128_16.cpp000066400000000000000000000066001264277366700243550ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP #pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_128_16_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *diag_dtrtri_upper_128_16_bin = 0; size_t diag_dtrtri_upper_128_16_binSize = 0; const char * const diag_dtrtri_upper_128_16_src = STRINGIFY( #define BLOCK_SIZE 16 \n #define NB 128 \n #define ZERO ( 0.0) \n #define ONE ( 1.0) \n #ifdef DOUBLE_PRECISION \n #ifdef cl_khr_fp64 \n #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n #else \n #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n #endif \n #endif \n __kernel void diag_dtrtri_upper_128_16_src(\n int isDiagUnit,\n __global double const * restrict A, \n uint offA, \n __global double *d_dinvA, \n uint lda, \n uint na)\n { \n int i, j;\n double Ystx = 0;\n __local double *y = 0;\n double switcher;\n double neg_switcher;\n // Thread index int tx = get_local_id(0);\n // Thread index int gx = get_global_id(0);\n // Block index int bx = get_group_id(0);\n A = A + offA;\n __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE;\n int NumBLperNB = NB/BLOCK_SIZE;\n d_dinvA += bx/NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE);\n __local double Bs[BLOCK_SIZE*BLOCK_SIZE];\n __local double workspace[BLOCK_SIZE]; \n // workspace used to store the current working column // load A _Pragma("unroll")\n for( i=0; i < BLOCK_SIZE; i++ )\n {\n if(tx <= i && i+bx*BLOCK_SIZE < na )\n {\n Bs[i*BLOCK_SIZE+tx] = *(Aoff+i*lda+tx);\n }\n else\n {\n Bs[i*BLOCK_SIZE+tx] = ZERO;\n }\n }\n // read in the whole square block of my A and zero out the non data triangular // Synchronize to make sure the matrices are loaded //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n // solve the diagonals if(isDiagUnit == 1)\n {\n Bs[tx*BLOCK_SIZE+tx] = ONE;\n }\n else\n {\n if( Bs[tx*BLOCK_SIZE+tx] == ZERO )\n {\n Bs[tx*BLOCK_SIZE+tx] = ONE; \n }\n else \n {\n Bs[tx*BLOCK_SIZE+tx] = ONE / ( Bs[tx*BLOCK_SIZE+tx]) ;\n }\n }\n /* the upper case */ for( i=0; i < BLOCK_SIZE; i++ ) {\n Ystx = ZERO;\n if( tx < i)\n {\n switcher = ONE;\n }\n else\n {\n switcher = ZERO;\n }\n //dtrmv workspace[tx] = *(Bs+i*BLOCK_SIZE+tx);\n y = Bs+i*BLOCK_SIZE;\n _Pragma("unroll")\n //for( j=tx; j < i; j++ ) for( j=0; j < i; j++ )\n {\n Ystx += switcher * (*(Bs+j*BLOCK_SIZE+tx)*workspace[j]);\n }\n //sscal // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]); if( tx != i)\n {\n switcher = ONE;\n neg_switcher = ZERO;\n }\n else\n {\n switcher = ZERO;\n neg_switcher = ONE;\n }\n y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE+i])+neg_switcher*y[tx];\n // __syncthreads(); barrier(CLK_LOCAL_MEM_FENCE);\n }\n // write back A _Pragma("unroll")\n for( i=0; i < BLOCK_SIZE; i++ )\n {\n *(d_dinvA+i*NB+tx) = Bs[i*BLOCK_SIZE+tx];\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/diag_dtrtri_upper_192_12.cpp000066400000000000000000000064171264277366700243600ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP #define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP #pragma message("#define KERNEL_DIAG_DTRTRI_UPPER_192_12_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *diag_dtrtri_upper_192_12_bin = 0; size_t diag_dtrtri_upper_192_12_binSize = 0; const char * const diag_dtrtri_upper_192_12_src = STRINGIFY( #define BLOCK_SIZE 12 \n #define NB 192 \n #define ZERO ( 0.0) \n #define ONE ( 1.0) \n #ifdef DOUBLE_PRECISION \n #ifdef cl_khr_fp64 \n #pragma OPENCL EXTENSION cl_khr_fp64 : enable \n #else \n #pragma OPENCL EXTENSION cl_amd_fp64 : enable \n #endif \n #endif \n __kernel void diag_dtrtri_upper_192_12_src(\n int isDiagUnit,\n __global double const * restrict A, \n uint offA, \n __global double *d_dinvA, \n uint lda, \n uint na)\n { \n int i, j;\n double Ystx = 0; \n __local double *y = 0; \n double switcher; \n double neg_switcher; \n // Thread index int tx = get_local_id(0); \n // Thread index int gx = get_global_id(0); \n // Block index int bx = get_group_id(0); \n A = A + offA; \n __global const double *Aoff = A + bx*lda*BLOCK_SIZE + bx*BLOCK_SIZE; \n int NumBLperNB = NB / BLOCK_SIZE; \n d_dinvA += bx / NumBLperNB*NB*NB + (bx % NumBLperNB)*(NB*BLOCK_SIZE + BLOCK_SIZE); \n __local double Bs[BLOCK_SIZE*BLOCK_SIZE]; \n __local double workspace[BLOCK_SIZE];\n // workspace used to store the current working column // load A \n _Pragma("unroll")\n for (i = 0; i < BLOCK_SIZE; i++)\n { \n if (tx <= i && i + bx*BLOCK_SIZE < na)\n { \n Bs[i*BLOCK_SIZE + tx] = *(Aoff + i*lda + tx); \n }\n else\n { \n Bs[i*BLOCK_SIZE + tx] = ZERO; \n }\n }\n // read in the whole square block of my A and zero out the non data triangular // Synchronize to make sure the matrices are loaded //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n // solve the diagonals if (isDiagUnit == 1)\n { \n Bs[tx*BLOCK_SIZE + tx] = ONE; \n }\n else\n { \n if (Bs[tx*BLOCK_SIZE + tx] == ZERO)\n { \n Bs[tx*BLOCK_SIZE + tx] = ONE; \n }\n else\n { \n Bs[tx*BLOCK_SIZE + tx] = ONE / (Bs[tx*BLOCK_SIZE + tx]); \n }\n }\n /* the upper case */ for (i = 0; i < BLOCK_SIZE; i++) {\n Ystx = ZERO; \n if (tx < i)\n { \n switcher = ONE; \n }\n else\n { \n switcher = ZERO; \n }\n //dtrmv workspace[tx] = *(Bs + i*BLOCK_SIZE + tx); \n y = Bs + i*BLOCK_SIZE; \n _Pragma("unroll")\n //for( j=tx; j < i; j++ ) for (j = 0; j < i; j++)\n Ystx += switcher * (*(Bs + j*BLOCK_SIZE + tx)*workspace[j]); \n //sscal // if (tx != i) y[tx]=switcher*Ystx*(-Bs[i*BLOCK_SIZE+i]); if (tx != i)\n { \n switcher = ONE; \n neg_switcher = ZERO; \n }\n else\n { \n switcher = ZERO; \n neg_switcher = ONE; \n }\n y[tx] = switcher *Ystx*(-Bs[i*BLOCK_SIZE + i]) + neg_switcher*y[tx]; \n // __syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // write back A _Pragma("unroll")\n for (i = 0; i < BLOCK_SIZE; i++)\n *(d_dinvA + i*NB + tx) = Bs[i*BLOCK_SIZE + tx]; \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_16_PART1_L.cpp000066400000000000000000000123131264277366700260400ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_16_PART1_L_bin = 0; size_t triple_dgemm_update_128_16_PART1_L_binSize = 0; const char * const triple_dgemm_update_128_16_PART1_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_16_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n { \n const int bIdy = get_group_id(1) / npages;\n //const int page = (get_group_id(1))%(npages); const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n //--------------------------part one---------------------------// { // A21*inv(A11) -> A21 // A=A21, B=inv(A11), C=A21 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n int PagesPerNB = NB / (blk * 2); \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + blk + ibx + id; \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA; \n C = d_dinvA + blk; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 4][iny] = B[4 + 0 * ldb]; \n bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n bs[inx + 12][iny] = B[12 + 0 * ldb]; \n bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n #undef READA\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_16_PART2_L.cpp000066400000000000000000000104151264277366700260420ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_16_PART2_L_bin = 0; size_t triple_dgemm_update_128_16_PART2_L_binSize = 0; const char * const triple_dgemm_update_128_16_PART2_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_16_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n //--------------------------part two---------------------------// { // -inv(A22)*A21 -> A21 // A=inv(A22), B=A21, C=A21 __global double *A, *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n int PagesPerNB = NB / (blk * 2); \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA + blk*NB + blk; \n B = C = d_dinvA + blk; \n A += ibx + id; 
\n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 4][iny] = B[4 + 0 * ldb]; \n bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n bs[inx + 12][iny] = B[12 + 0 * ldb]; \n bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_16_R.cpp000066400000000000000000000167501264277366700251500ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * B21 = -inv(A11)*A12*inv(A22) * 16 to 32 ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_16_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_16_R_bin = 0; size_t triple_dgemm_update_128_16_R_binSize = 0; const char * const triple_dgemm_update_128_16_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_16_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages;\n //const int page = (blockIdx.y)%(npages); const int page = qmod(get_group_id(1), npages);\n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { \n // A12*inv(A22) -> A12 // A=A12, B=inv(A22), C=A12(d_dinvA) __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + ibx + id; \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA + blk*NB + blk; \n C = d_dinvA + blk*NB; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 4][iny] = B[4 + 0 * ldb]; \n bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n bs[inx + 12][iny] = B[12 + 0 * ldb]; \n bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n #undef READA\n //--------------------------part two---------------------------// { // -inv(A11)*A12 -> A12 // A=inv(A11), B=A12, C=A12 __global double *A, *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n A = d_dinvA; \n B = C = d_dinvA + blk*NB; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do { double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 4][iny] = B[4 + 0 * ldb]; \n bs[inx + 4][iny + 4] = B[4 + 4 * ldb]; \n bs[inx + 4][iny + 8] = B[4 + 8 * ldb]; \n bs[inx + 4][iny + 12] = B[4 + 12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n bs[inx + 12][iny] = B[12 + 0 * ldb]; \n bs[inx + 12][iny + 4] = B[12 + 4 * ldb]; \n bs[inx + 12][iny + 8] = B[12 + 8 * ldb]; \n bs[inx + 12][iny + 12] = B[12 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_L.cpp000066400000000000000000000115001264277366700260330ustar00rootroot00000000000000/******************************************************************************* * 
Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_32_PART1_L_bin = 0; size_t triple_dgemm_update_128_32_PART1_L_binSize = 0; const char * const triple_dgemm_update_128_32_PART1_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n { \n const int bIdy = get_group_id(1) / npages;\n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A21*inv(A11) -> A21 // A=A21, B=inv(A11), C=A21 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + blk + ibx + id; \n int ya = page*blk * 2;\n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA;\n C = d_dinvA + blk; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_32_PART1_R.cpp000066400000000000000000000116211264277366700260450ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_32_PART1_R_bin = 0; size_t triple_dgemm_update_128_32_PART1_R_binSize = 0; const char * const triple_dgemm_update_128_32_PART1_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n { \n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A12*inv(A22) -> A21 // A=A12, B=inv(A22), C=A12(d_dinvA) __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + ibx + id; \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA + blk*NB + blk; \n C = d_dinvA + blk*NB; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n \n daxpy(a[0], &bs[4][0], c); a[0] = ( (incA < maxA ) ? Ain[incA] : 0 ) ; incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n \n daxpy(a[0], &bs[8][0], c); a[0] = ( (incA < maxA ) ? Ain[incA] : 0 ) ; incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c);\n daxpy(a[1], &bs[13][0], c);\n daxpy(a[2], &bs[14][0], c);\n daxpy(a[3], &bs[15][0], c);\n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_L.cpp000066400000000000000000000076541264277366700260530ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_32_PART2_L_bin = 0; size_t triple_dgemm_update_128_32_PART2_L_binSize = 0; const char * const triple_dgemm_update_128_32_PART2_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages;\n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A22)*A21 -> A21 // A=inv(A22), B=A21, C=A21 __global const double *A; \n __global double *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA + blk*NB + blk;\n B = C = d_dinvA + blk; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n 
bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_32_PART2_R.cpp000066400000000000000000000076141264277366700260550ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_32_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_32_PART2_R_bin = 0; size_t triple_dgemm_update_128_32_PART2_R_binSize = 0; const char * const triple_dgemm_update_128_32_PART2_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_32_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages;\n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * (get_local_size(0)*get_local_size(1)); \n const int iby = bIdy * 16; \n const int id = inx + iny*get_local_size(0); \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A11)*A12 -> A12 // A=inv(A11), B=A12, C=A12 __global double *A, *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA; \n B = C = d_dinvA + blk*NB; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n bs[inx + 8][iny] = B[8 + 0 * ldb]; \n bs[inx + 8][iny + 4] = B[8 + 4 * ldb]; \n bs[inx + 8][iny + 8] = B[8 + 8 * ldb]; \n bs[inx + 8][iny + 12] = B[8 + 12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif 
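/*
 * The PART1/PART2 kernel pairs in this directory together form the
 * off-diagonal block of a 2x2 block-triangular inverse.  For the
 * upper-triangular ("R") case,
 *
 *     A = [ A11  A12 ]        inv(A) = [ inv(A11)   -inv(A11)*A12*inv(A22) ]
 *         [  0   A22 ]                 [    0              inv(A22)        ]
 *
 * so PART1 computes T = A12*inv(A22) and PART2 computes -inv(A11)*T, writing
 * the result back into the A12 slot of d_dinvA; the "L" variants do the
 * mirrored lower-triangular update B21 = -inv(A22)*A21*inv(A11).  The snippet
 * below is a minimal host-side sketch of those two GEMM steps on plain
 * column-major arrays.  The helper name and its scratch-buffer argument are
 * illustrative only and not part of the clBLAS build (hence the #if 0), and
 * it assumes inv(A11) and inv(A22) were produced beforehand, as the diagonal
 * inversion kernels elsewhere in this directory do.
 */
#if 0
static void triple_dgemm_update_reference(const double *A12,    /* blk x blk, leading dimension ld */
                                          const double *invA11, /* blk x blk, leading dimension ld */
                                          const double *invA22, /* blk x blk, leading dimension ld */
                                          double *B12,          /* output block, leading dimension ld */
                                          double *T,            /* blk*blk scratch */
                                          int blk, int ld)
{
    int i, j, k;
    /* step 1 (PART1): T = A12 * inv(A22) */
    for (j = 0; j < blk; ++j)
        for (i = 0; i < blk; ++i) {
            double s = 0.0;
            for (k = 0; k < blk; ++k)
                s += A12[i + k * ld] * invA22[k + j * ld];
            T[i + j * blk] = s;
        }
    /* step 2 (PART2): B12 = -inv(A11) * T */
    for (j = 0; j < blk; ++j)
        for (i = 0; i < blk; ++i) {
            double s = 0.0;
            for (k = 0; k < blk; ++k)
                s += invA11[i + k * ld] * T[k + j * blk];
            B12[i + j * ld] = -s;
        }
}
#endif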
clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_L.cpp000066400000000000000000000111331264277366700260420ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A22)*A21*inv(A11) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_64_PART1_L_bin = 0; size_t triple_dgemm_update_128_64_PART1_L_binSize = 0; const char * const triple_dgemm_update_128_64_PART1_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART1_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A21*inv(A11) -> A21 // A=A21, B=inv(A11), C=A21 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + blk + ibx + id; \n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0;\n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA;\n C = d_dinvA + blk; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n #undef READA\n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_64_PART1_R.cpp000066400000000000000000000112401264277366700260470ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_64_PART1_R_bin = 0; size_t triple_dgemm_update_128_64_PART1_R_binSize = 0; const char * const triple_dgemm_update_128_64_PART1_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A12*inv(A22) -> A12(d_dinvA) // A=A12, B=inv(A22), C=A12 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + ibx + id; \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA + blk*NB + blk; \n C = d_dinvA + blk*NB; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n \n daxpy(a[0], &bs[4][0], c); a[0] = ( (incA < maxA ) ? Ain[incA] : 0 ); incA += lda;\n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n \n daxpy(a[0], &bs[8][0], c); a[0] = ( (incA < maxA ) ? Ain[incA] : 0 ); incA += lda;\n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c);\n daxpy(a[1], &bs[13][0], c);\n daxpy(a[2], &bs[14][0], c);\n daxpy(a[3], &bs[15][0], c);\n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n #undef READA\n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_L.cpp000066400000000000000000000073621264277366700260540ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A22)*A21*inv(A11) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_64_PART2_L_bin = 0; size_t triple_dgemm_update_128_64_PART2_L_binSize = 0; const char * const triple_dgemm_update_128_64_PART2_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A22)*A21 -> A21 // A=inv(A22), B=A21, C=A21 __global const double *A; \n __global double *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA + blk*NB + blk; \n B = C = d_dinvA + blk; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], 
&bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_64_PART2_R.cpp000066400000000000000000000073451264277366700260630ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_64_PART2_R_bin = 0; size_t triple_dgemm_update_128_64_PART2_R_binSize = 0; const char * const triple_dgemm_update_128_64_PART2_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_64_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A11)*A12 -> A12 // A=inv(A11), B=A12, C=A12 __global const double *A; \n __global double *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA; \n B = C = d_dinvA + blk*NB; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do { double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n 
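/* each pass of this loop stages a 16x16 tile of B in __local bs (declared [16][17]; the extra column is presumably padding to avoid local-memory bank conflicts) and then applies sixteen daxpy rank-1 updates to the per-thread accumulator c[16] */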
bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_L.cpp000066400000000000000000000111741264277366700266240ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A22)*A21*inv(A11) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART1_L_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART1_L_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART1_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A21*inv(A11) -> A21 // A=A21, B=inv(A11), C=A21 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + blk + ibx + id;\n int ya = page*blk * 2; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow 
on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA;\n C = d_dinvA + blk; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n #undef READA\n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART1_R.cpp000066400000000000000000000112161264277366700266270ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART1_R_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART1_R_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART1_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part one---------------------------// { // A12*inv(A22) -> A12(d_dinvA) // A=A12, B=inv(A22), C=A12 __global const double *A; \n __global double *B, *C; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n int xa = page*blk * 2 + ibx + id; \n int ya = page*blk * 2 + blk; \n int incA = ya * lda + xa; \n // maxA will be used to detect overflow on all subsequent accesses on A(xa, ya:ya+???) int maxA; \n if (xa < na)\n maxA = lda*na; \n // macro READA will detect overflow on y dimension else\n maxA = 0; \n // there is already an overflow on xa #define READA ( (incA < maxA ) ? Ain[incA] : 0 ) \n B = d_dinvA + blk*NB + blk; \n C = d_dinvA + blk*NB; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4]; \n a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n daxpy(a[0], &bs[0][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[1][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[2][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[3][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[4][0], c); a[0] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[5][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[6][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[7][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[8][0], c); a[0] = ((incA < maxA) ? 
Ain[incA] : 0); incA += lda; \n daxpy(a[1], &bs[9][0], c); a[1] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[2], &bs[10][0], c); a[2] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[3], &bs[11][0], c); a[3] = ((incA < maxA) ? Ain[incA] : 0); incA += lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_L.cpp000066400000000000000000000074151264277366700266300ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A22)*A21*inv(A11) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART2_L_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART2_L_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART2_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages;\n const int page = qmod(get_group_id(1), npages);\n const int inx = get_local_id(0);\n const int iny = get_local_id(1);\n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A22)*A21 -> A21 // A=inv(A22), B=A21, C=A21 __global double *A, *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA + blk*NB + blk;\n B = d_dinvA + blk; \n C = d_dinvA + blk*NB; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = { A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], 
&bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART2_R.cpp000066400000000000000000000074371264277366700266420ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * B21 = -inv(A11)*A12*inv(A22) * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART2_R_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART2_R_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART2_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n __local double bs[16][17]; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { \n // -inv(A11)*A12 -> A12 // A=inv(A11), B=A12, C=A12 __global const double *A; \n __global double *B, *C; \n int lda = NB; \n int ldb = NB; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n A = d_dinvA; \n B = d_dinvA + blk*NB; \n C = d_dinvA + blk; \n A += ibx + id; \n B += inx + __mul(iby + iny, ldb); \n C += ibx + id + __mul(iby, ldc); \n __global double *Blast = B + blk; \n double c[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; \n do {\n double a[4] = 
{ A[0 * lda], A[1 * lda], A[2 * lda], A[3 * lda] }; \n bs[inx][iny] = B[0 * ldb]; \n bs[inx][iny + 4] = B[4 * ldb]; \n bs[inx][iny + 8] = B[8 * ldb]; \n bs[inx][iny + 12] = B[12 * ldb]; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n A += 4 * lda; \n daxpy(a[0], &bs[0][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[1][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[2][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[3][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[4][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[5][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[6][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[7][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[8][0], c); a[0] = A[0 * lda]; \n daxpy(a[1], &bs[9][0], c); a[1] = A[1 * lda]; \n daxpy(a[2], &bs[10][0], c); a[2] = A[2 * lda]; \n daxpy(a[3], &bs[11][0], c); a[3] = A[3 * lda]; \n A += 4 * lda; \n daxpy(a[0], &bs[12][0], c); \n daxpy(a[1], &bs[13][0], c); \n daxpy(a[2], &bs[14][0], c); \n daxpy(a[3], &bs[15][0], c); \n B += 16; \n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n } while (B < Blast); \n for (int i = 0; i < 16; i++) {\n C[0] = (-1)*c[i]; \n C += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_L.cpp000066400000000000000000000050761264277366700266320ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * part 3: copy data back to position * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART3_L_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART3_L_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART3_L_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define ZERO ( 0.0) \n #define ONE ( 1.0) \n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_L(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages; \n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part three---------------------------// { // -inv(A22)*A21 -> A21 // A=inv(A22), B=A21, C=A21 __global double *C_temp, *C_real; \n int ldc = NB; \n d_dinvA += NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2); \n C_real = d_dinvA + blk;\n C_temp = d_dinvA + blk*NB; \n C_temp += ibx + id + __mul(iby, ldc); \n C_real += ibx + id + __mul(iby, ldc); \n for (int i = 0; i < 16; i++) {\n C_real[0] = C_temp[0]; \n C_temp[0] = ZERO; \n C_real += ldc; \n C_temp += ldc; \n }\n }\n //__syncthreads(); barrier(CLK_LOCAL_MEM_FENCE); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_128_ABOVE64_PART3_R.cpp000066400000000000000000000051561264277366700266370ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * * * part 3, copy data into position * ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) 
STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_128_ABOVE64_PART3_R_bin = 0; size_t triple_dgemm_update_128_ABOVE64_PART3_R_binSize = 0; const char * const triple_dgemm_update_128_ABOVE64_PART3_R_src = STRINGIFY( static void daxpy(\n double alpha, \n __local const double * __restrict__ b, \n double * __restrict__ c)\n { \n c[0] += alpha * b[0]; \n c[1] += alpha * b[1]; \n c[2] += alpha * b[2]; \n c[3] += alpha * b[3]; \n c[4] += alpha * b[4]; \n c[5] += alpha * b[5]; \n c[6] += alpha * b[6]; \n c[7] += alpha * b[7]; \n c[8] += alpha * b[8]; \n c[9] += alpha * b[9]; \n c[10] += alpha * b[10]; \n c[11] += alpha * b[11]; \n c[12] += alpha * b[12]; \n c[13] += alpha * b[13]; \n c[14] += alpha * b[14]; \n c[15] += alpha * b[15]; \n }\n #define NB 128\n #define ZERO ( 0.0) \n #define ONE ( 1.0) \n #define __mul(i,j) ((i)*(j))\n #define qmod(a, b) ((a)%(b))\n __kernel void TRIPLE_DGEMM_UPDATE_128_ABOVE64_PART3_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n const int bIdy = get_group_id(1) / npages;\n const int page = qmod(get_group_id(1), npages); \n const int inx = get_local_id(0); \n const int iny = get_local_id(1); \n const int ibx = get_group_id(0) * 64; \n const int iby = bIdy * 16; \n const int id = inx + iny * 16; \n Ain = Ain + offAin; \n int PagesPerNB = NB / (blk * 2); \n //--------------------------part two---------------------------// { // -inv(A11)*A12 -> A12 // A=inv(A11), B=A12, C=A12 __global double *C_temp, *C_real; \n int ldc = NB; \n C_temp = d_dinvA + NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + (qmod(page, PagesPerNB))*(blk * 2) + blk; \n C_real = d_dinvA + NB*NB*(page / PagesPerNB) + (qmod(page, PagesPerNB))*(blk * 2)*NB + blk*NB + (qmod(page, PagesPerNB))*(blk * 2); \n C_temp += ibx + id + __mul(iby, ldc); \n C_real += ibx + id + __mul(iby, ldc); \n for (int i = 0; i < 16; i++) {\n C_real[0] = C_temp[0]; \n C_temp[0] = ZERO; \n C_temp += ldc; \n C_real += ldc; \n }\n }\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_12_R.cpp000066400000000000000000000141771264277366700251460ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel * B21 = -inv(A11)*A12*inv(A22) * 12 to 24 ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_12_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_12_R_bin = 0; size_t triple_dgemm_update_192_12_R_binSize = 0; const char * const triple_dgemm_update_192_12_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_12_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 12 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 96 this is 4 for 192 this is 8 //Work group size is [12] //global work size is [96*number of blocks] //each work item in each work group is responsible for every element in that row //each work group is responsible for one gemm;\ ////////////// A12*invA22 const uint gidx = get_group_id(0);\n const uint idx = get_local_id(0);\n const uint page = gidx % npages;\n const uint page_block = page / 8;\n//8 pages per page block const uint page_index_in_block = page % 8;\n __global double *B, *C;\n __local double lA[12][12];\n __local double lB[12][12];\n double privateC[12] = { (double)0 };\n //decide A12 location for each page Ain = Ain + offAin;\n Ain += (page*blk * 2 + blk) * lda + page * 2 * blk;\n //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk + blk;\n //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + (page_index_in_block*blk * 2 + blk) * NB + page_index_in_block * 2 * blk;\n //read A and B into LDS no transpose operated here lA[idx][0] = Ain[idx];\n lA[idx][1] = Ain[idx + lda];\n lA[idx][2] = Ain[idx + lda * 2];\n lA[idx][3] = Ain[idx + lda * 3];\n lA[idx][4] = Ain[idx + lda * 4];\n lA[idx][5] = Ain[idx + lda * 5];\n lA[idx][6] = Ain[idx + lda * 6];\n lA[idx][7] = Ain[idx + lda * 7];\n lA[idx][8] = Ain[idx + lda * 8];\n lA[idx][9] = Ain[idx + lda * 9];\n lA[idx][10] = Ain[idx + lda * 10];\n lA[idx][11] = Ain[idx + lda * 11];\n lB[idx][0] = B[idx];\n lB[idx][1] = B[idx + NB];\n lB[idx][2] = B[idx + NB * 2];\n lB[idx][3] = B[idx + NB * 3];\n lB[idx][4] = B[idx + NB * 4];\n lB[idx][5] = B[idx + NB * 5];\n lB[idx][6] = B[idx + NB * 6];\n lB[idx][7] = B[idx + NB * 7];\n lB[idx][8] = B[idx + NB * 8];\n lB[idx][9] = B[idx + NB * 9];\n lB[idx][10] = B[idx + NB * 10];\n lB[idx][11] = B[idx + NB * 11];\n barrier(CLK_LOCAL_MEM_FENCE);\n //do math uint i = 0;\n do{\n privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);\n privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);\n //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1;\n } while (i < 12);\n i = 0;\n do{\n C[NB*i + idx] = privateC[i];\n i = i + 1;\n } while (i < 12);\n ////////////// -invA11*invA12 barrier(CLK_GLOBAL_MEM_FENCE);\n //A is moving to invA11 __global double *A;\n A = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2) * NB + (page % 4) * 2 * blk;\n //both B and C are pointing at invA12 B = C;\n //read A and B into LDS no transpose operated here lA[idx][0] = A[idx];\n lA[idx][1] = A[idx + NB];\n lA[idx][2] = A[idx + NB * 2];\n lA[idx][3] = A[idx + NB * 3];\n lA[idx][4] = A[idx + NB * 4];\n lA[idx][5] = A[idx + NB * 5];\n lA[idx][6] = A[idx + NB * 6];\n lA[idx][7] = A[idx + NB * 7];\n lA[idx][8] = A[idx + NB * 8];\n lA[idx][9] = A[idx + NB * 9];\n lA[idx][10] = A[idx + NB * 10];\n lA[idx][11] = A[idx + NB * 11];\n lB[idx][0] = B[idx];\n lB[idx][1] = B[idx + NB];\n lB[idx][2] = B[idx + NB * 2];\n lB[idx][3] = B[idx + 
NB * 3];\n lB[idx][4] = B[idx + NB * 4];\n lB[idx][5] = B[idx + NB * 5];\n lB[idx][6] = B[idx + NB * 6];\n lB[idx][7] = B[idx + NB * 7];\n lB[idx][8] = B[idx + NB * 8];\n lB[idx][9] = B[idx + NB * 9];\n lB[idx][10] = B[idx + NB * 10];\n lB[idx][11] = B[idx + NB * 11];\n barrier(CLK_LOCAL_MEM_FENCE);\n //do math i = 0;\n privateC[0] = 0;\n privateC[1] = 0;\n privateC[2] = 0;\n privateC[3] = 0;\n privateC[4] = 0;\n privateC[5] = 0;\n privateC[6] = 0;\n privateC[7] = 0;\n privateC[8] = 0;\n privateC[9] = 0;\n privateC[10] = 0;\n privateC[11] = 0;\n do{\n privateC[0] = mad(lA[idx][i], lB[i][0], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10], privateC[10]);\n privateC[11] = mad(lA[idx][i], lB[i][11], privateC[11]);\n //mem_fence(CLK_LOCAL_MEM_FENCE); i = i + 1;\n } while (i < 12);\n i = 0;\n do{\n C[NB*i + idx] = -1 * privateC[i];\n i = i + 1;\n } while (i < 12);\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_24_PART1_R.cpp000066400000000000000000000114601264277366700260500ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_24_PART1_R_bin = 0; size_t triple_dgemm_update_192_24_PART1_R_binSize = 0; const char * const triple_dgemm_update_192_24_PART1_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_24_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, uint lda, int npages, int na)\n { \n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 24 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4 //Work group size is [24, 2] //global work size is [96*number of blocks, 2] //each work item in each work group is responsible for 12 elements (half) in that row //each work group is responsible for one gemm; ////////////// A12*invA22 const uint gidx = get_group_id(0); \n const uint gidy = get_group_id(1); \n const uint idx = get_local_id(0); \n const uint idy = get_local_id(1); \n const uint page = gidx % npages; \n//0-3 for 192; 0-1 for 96 const uint page_block = page / 4; \n//4 pages per page block __global double *B, *C; \n __local double lA[24][24]; \n __local double lB[24][24]; \n double privateC[12] = { (double)0 }; \n //decide A12 location for each page Ain = Ain + offAin; \n Ain += (page*blk * 2 + blk) * lda + page * 2 * blk; \n //decide invA22 (B) location for each page B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk + blk; \n //decide invA12 location for each page C = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; \n //read A and B into LDS no transpose operated here //each work iteam loads half a row lA[idx][0 + idy * 12] = Ain[idx + idy * 12 * lda]; \n lA[idx][1 + idy * 12] = Ain[idx + lda + idy * 12 * lda]; \n lA[idx][2 + idy * 12] = Ain[idx + lda * 2 + idy * 12 * lda]; \n lA[idx][3 + idy * 12] = Ain[idx + lda * 3 + idy * 12 * lda]; \n lA[idx][4 + idy * 12] = Ain[idx + lda * 4 + idy * 12 * lda]; \n lA[idx][5 + idy * 12] = Ain[idx + lda * 5 + idy * 12 * lda]; \n lA[idx][6 + idy * 12] = Ain[idx + lda * 6 + idy * 12 * lda]; \n lA[idx][7 + idy * 12] = Ain[idx + lda * 7 + idy * 12 * lda]; \n lA[idx][8 + idy * 12] = Ain[idx + lda * 8 + idy * 12 * lda]; \n lA[idx][9 + idy * 12] = Ain[idx + lda * 9 + idy * 12 * lda]; \n lA[idx][10 + idy * 12] = Ain[idx + lda * 10 + idy * 12 * lda]; \n lA[idx][11 + idy * 12] = Ain[idx + lda * 11 + idy * 12 * lda]; \n lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB]; \n lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB]; \n lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB]; \n lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB]; \n lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB]; \n lB[idx][5 + idy * 12] = B[idx + NB * 5 + idy * 12 * NB]; \n lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB]; \n lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB]; \n lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB]; \n lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB]; \n lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB]; \n lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB]; \n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{ \n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); \n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); \n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); \n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); \n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); \n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); \n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); \n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); \n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); \n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); \n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; 
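/* Each pass of this loop adds one rank-1 term: work-item (idx, idy) owns row idx of the
   24x24 output tile and the 12 columns starting at idy*12, so on exit privateC[j] holds
   the dot product of row idx of lA (the A12 tile) with column j + idy*12 of lB (the
   inv(A22) tile), i.e. one half-row of the per-page product A12 * inv(A22). */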
\n } while (i < 24); \n i = 0; \n do{ \n C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_24_PART2_R.cpp000066400000000000000000000112651264277366700260540ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_24_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_24_PART2_R_bin = 0; size_t triple_dgemm_update_192_24_PART2_R_binSize = 0; const char * const triple_dgemm_update_192_24_PART2_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_24_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 24 here. // lda in leading dimension. Column major here // npages = (na-1)/12*2 + 1; for 96 this is 2 for 192 this is 4 //Work group size is [24, 2] //global work size is [96*number of blocks, 2] //each work item in each work group is responsible for 12 elements (half) in that row //each work group is responsible for one gemm; ////////////// -invA11*invA12 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1);\n const uint idx = get_local_id(0);\n const uint idy = get_local_id(1);\n const uint page = gidx % npages; \n//0-3 for 192; 0-1 for 96 const uint page_block = page / 4; \n//4 pages per page block __global double *A, *B, *C; \n __local double lA[24][24]; \n __local double lB[24][24]; \n double privateC[12] = { (double)0 }; \n //decide invA11 location for each page A = d_dinvA + page_block*NB*NB + (page % 4)*blk * 2 * NB + (page % 4) * 2 * blk; \n //decide invA12 location for each page B = d_dinvA + page_block*NB*NB + ((page % 4)*blk * 2 + blk) * NB + (page % 4) * 2 * blk; \n C = B; //C = d_dinvA + page_block*NB*NB + ((page%4)*blk*2) * NB + (page%4) * 2 * blk + blk; //read A and B into LDS no transpose operated here //each work iteam loads half a row lA[idx][0 + idy * 12] = A[idx + idy * 12 * NB]; \n lA[idx][1 + idy * 12] = A[idx + NB + idy * 12 * NB]; \n lA[idx][2 + idy * 12] = A[idx + NB * 2 + idy * 12 * NB];\n lA[idx][3 + idy * 12] = A[idx + NB * 3 + idy * 12 * NB];\n lA[idx][4 + idy * 12] = A[idx + NB * 4 + idy * 12 * NB];\n lA[idx][5 + idy * 12] = A[idx + NB * 5 + idy * 12 * NB];\n lA[idx][6 + idy * 12] = A[idx + NB * 6 + idy * 12 * NB];\n lA[idx][7 + idy * 12] = A[idx + NB * 7 + idy * 12 * NB];\n lA[idx][8 + idy * 12] = A[idx + NB * 8 + idy * 12 * NB];\n lA[idx][9 + idy * 12] = A[idx + NB * 9 + idy * 12 * NB];\n lA[idx][10 + idy * 12] = A[idx + NB * 10 + idy * 12 * NB];\n lA[idx][11 + idy * 12] = A[idx + NB * 11 + idy * 12 * NB];\n lB[idx][0 + idy * 12] = B[idx + idy * 12 * NB];\n lB[idx][1 + idy * 12] = B[idx + NB + idy * 12 * NB];\n lB[idx][2 + idy * 12] = B[idx + NB * 2 + idy * 12 * NB];\n lB[idx][3 + idy * 12] = B[idx + NB * 3 + idy * 12 * NB];\n lB[idx][4 + idy * 12] = B[idx + NB * 4 + idy * 12 * NB];\n lB[idx][5 + idy * 12] = 
B[idx + NB * 5 + idy * 12 * NB];\n lB[idx][6 + idy * 12] = B[idx + NB * 6 + idy * 12 * NB];\n lB[idx][7 + idy * 12] = B[idx + NB * 7 + idy * 12 * NB];\n lB[idx][8 + idy * 12] = B[idx + NB * 8 + idy * 12 * NB];\n lB[idx][9 + idy * 12] = B[idx + NB * 9 + idy * 12 * NB];\n lB[idx][10 + idy * 12] = B[idx + NB * 10 + idy * 12 * NB];\n lB[idx][11 + idy * 12] = B[idx + NB * 11 + idy * 12 * NB];\n barrier(CLK_LOCAL_MEM_FENCE);\n //do math uint i = 0;\n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]);\n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]);\n i = i + 1;\n } while (i < 24);\n i = 0;\n do{\n C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i];\n i = i + 1;\n } while (i < 12);\n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_48_PART1_R.cpp000066400000000000000000000150661264277366700260640ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_48_PART1_R_bin = 0; size_t triple_dgemm_update_192_48_PART1_R_binSize = 0; const char * const triple_dgemm_update_192_48_PART1_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_48_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [96*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// A12*invA22 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1);\n const uint idx = get_local_id(0);\n const uint idy = get_local_id(1);\n //uint page = gidx / 2;//0-1 for 192; 0 for 96 const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block __global double *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide A12 location for each page //each workgroup loads half of A (left or right) Ain = Ain + offAin; \n Ain += page_block*NB*lda + page_block*NB + page*blk * 2 * lda + page*blk * 2 + blk*lda + gidx % 2 * (blk / 2); \n //decide invA22 (B) location for each page //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + blk + gidy*(blk / 2)*NB; \n //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n lB[7 + idy * 24][idx] = 
B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_48_PART2_R.cpp000066400000000000000000000146551264277366700260700ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_48_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_48_PART2_R_bin = 0; size_t triple_dgemm_update_192_48_PART2_R_binSize = 0; const char * const triple_dgemm_update_192_48_PART2_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_48_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n {\n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [96*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// -invA11*invA12 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1); \n const uint idx = get_local_id(0); \n const uint idy = get_local_id(1); \n //uint page = gidx / 2;//0-1 for 192; 0 for 96 const uint page = (gidx / 2) % 2; \n//index of page within a page_block; 2 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 2 WG per page; 4 WG per page_block __global double *A, *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide invA11 location for each page //each workgroup loads half of A (left or right) A = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + gidx % 2 * (blk / 2); \n //decide invA12 (B) location for each page //actually it was saved in invA21 from last kernel //each workgroup loads half of B (up or down) B = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidy*(blk / 2)*NB; \n //decide invA12 location for each page //each workgroup writes 1/4 of C C = d_dinvA + page_block*NB*NB + page*blk * 2 * NB + page*blk * 2 + blk*NB + gidx % 2 * (blk / 2) + gidy*(blk / 2)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; \n lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; \n lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];\n lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];\n lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];\n lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];\n lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];\n lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];\n lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];\n lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];\n lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];\n lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];\n lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];\n lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];\n lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];\n lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];\n lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];\n lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];\n lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];\n lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];\n lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];\n lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];\n lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];\n lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy 
* 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_96_PART1_R.cpp000066400000000000000000000153741264277366700260710ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART1_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_96_PART1_R_bin = 0; size_t triple_dgemm_update_192_96_PART1_R_binSize = 0; const char * const triple_dgemm_update_192_96_PART1_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_96_PART1_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n { // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 96 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 192 this is 1 for 384 this is 2 //Work group size is [24, 2] //global work size is [96*number of blocks, 8] //each work item in each work group is responsible for 12 elements (1/8) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// A12*invA22 const uint gidx = get_group_id(0);\n const uint gidy = get_group_id(1); \n const uint idx = get_local_id(0); \n const uint idy = get_local_id(1); \n //uint page = gidx / 2;//0-1 for 192; 0 for 96 //const uint page = (gidx/4)%1;//index of page within a page_block; 1 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 4 WG per page; 4 WG per page_block __global double *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide A12 location for each page //each workgroup loads 1/4 of A (left or right) Ain = Ain + offAin; \n Ain += page_block*NB*lda + page_block*NB + blk*lda + gidx % 4 * (blk / 4); \n //decide invA22 (B) location for each page //each workgroup loads 1/4 of B (up or down) B = d_dinvA + page_block*NB*NB + blk*NB + blk + gidy*(blk / 4)*NB; \n //decide invA12 location for each page; //Actually this will be stored in invA21 temporarily //each workgroup writes 1/4*1/4 of C C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //each loop loads 1/4 row of A and 1/4 column of B //idx 0-23 idy 0-1 uint block_k = blk / 48; \n //thus we need 2 iterations here do{\n barrier(CLK_LOCAL_MEM_FENCE); \n lA[idx][0 + idy * 24] = Ain[idx + idy * 24 * lda]; \n lA[idx][1 + idy * 24] = Ain[idx + lda + idy * 24 * lda]; \n lA[idx][2 + idy * 24] = Ain[idx + lda * 2 + idy * 24 * lda]; \n lA[idx][3 + idy * 24] = Ain[idx + lda * 3 + idy * 24 * lda]; \n lA[idx][4 + idy * 24] = Ain[idx + lda * 4 + idy * 24 * lda]; \n lA[idx][5 + idy * 24] = Ain[idx + lda * 5 + idy * 24 * lda]; \n lA[idx][6 + idy * 24] = Ain[idx + lda * 6 + idy * 24 * lda]; \n lA[idx][7 + idy * 24] = Ain[idx + lda * 7 + idy * 24 * lda]; \n lA[idx][8 + idy * 24] = Ain[idx + lda * 8 + idy * 24 * lda]; \n lA[idx][9 + idy * 24] = Ain[idx + lda * 9 + idy * 24 * lda]; \n lA[idx][10 + idy * 24] = Ain[idx + lda * 10 + idy * 24 * lda];\n lA[idx][11 + idy * 24] = Ain[idx + lda * 11 + idy * 24 * lda];\n lA[idx][12 + idy * 24] = Ain[idx + lda * 12 + idy * 24 * lda];\n lA[idx][13 + idy * 24] = Ain[idx + lda * 13 + idy * 24 * lda];\n lA[idx][14 + idy * 24] = Ain[idx + lda * 14 + idy * 24 * lda];\n lA[idx][15 + idy * 24] = Ain[idx + lda * 15 + idy * 24 * lda];\n lA[idx][16 + idy * 24] = Ain[idx + lda * 16 + idy * 24 * lda];\n lA[idx][17 + idy * 24] = Ain[idx + lda * 17 + idy * 24 * lda];\n lA[idx][18 + idy * 24] = Ain[idx + lda * 18 + idy * 24 * lda];\n lA[idx][19 + idy * 24] = Ain[idx + lda * 19 + idy * 24 * lda];\n lA[idx][20 + idy * 24] = Ain[idx + lda * 20 + idy * 24 * lda];\n lA[idx][21 + idy * 24] = Ain[idx + lda * 21 + idy * 24 * lda];\n lA[idx][22 + idy * 24] = Ain[idx + lda * 22 + idy * 24 * lda];\n lA[idx][23 + idy * 24] = Ain[idx + lda * 23 + idy * 24 * lda];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1];\n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = 
B[idx*NB + idy * 24 + 6];\n lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]); \n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]); \n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]); \n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]); \n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]); \n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]); \n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]); \n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]); \n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]); \n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]); \n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n Ain += 48 * lda; \n B += 48; \n } while (--block_k>0); \n uint i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/trtri/triple_dgemm_update_192_96_PART2_R.cpp000066400000000000000000000153411264277366700260640ustar00rootroot00000000000000/******************************************************************************* * Hand-tuned kernel ******************************************************************************/ #ifndef KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP #define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP #pragma message("#define KERNEL_TRIPLE_DGEMM_UPDATE_192_96_PART2_R_SRC_CPP.") #ifndef STRINGIFY #define STRINGIFY2(...) #__VA_ARGS__ #define STRINGIFY(...) STRINGIFY2(__VA_ARGS__) #endif unsigned char *triple_dgemm_update_192_96_PART2_R_bin = 0; size_t triple_dgemm_update_192_96_PART2_R_binSize = 0; const char * const triple_dgemm_update_192_96_PART2_R_src = STRINGIFY( #define NB 192\n __kernel void TRIPLE_DGEMM_UPDATE_192_96_PART2_R(__global const double *Ain, uint offAin, __global double *d_dinvA, int blk, int lda, int npages, int na)\n { \n // Ain is the non inverse matrix; the size of Ain is lda * na // offAin is the offset of Ain // d_dinvA is the inversed matrix. the size of d_invA is NB * (na-1)/NB + 1 // blk is subblock size, which is 48 here. // lda in leading dimension. 
Column major here // npages = (na-1)/12*2 + 1; for 96 this is 1 for 192 this is 2 //Work group size is [24, 2] //global work size is [48*number of blocks, 4] //each work item in each work group is responsible for 12 elements (1/4) in that row //each work group is responsible for 24 by 24 macro tile; ////////////// -invA11*invA12 const uint gidx = get_group_id(0); \n const uint gidy = get_group_id(1); \n const uint idx = get_local_id(0); \n const uint idy = get_local_id(1); \n //uint page = gidx / 2;//0-1 for 192; 0 for 96 //const uint page = (gidx/2)%2;//index of page within a page_block; 1 pages per page_block const uint page_block = gidx / 4; \n//#index of page_block; 4 WG per page; 4 WG per page_block __global double *A, *B, *C; \n __local double lA[24][48]; \n __local double lB[48][24]; \n double privateC[12] = { (double)0 }; \n //decide invA11 location for each page //each workgroup loads half of A (left or right) //A = d_dinvA + page*NB*NB + gidx%2*(blk/2); A = d_dinvA + page_block*NB*NB + gidx % 4 * (blk / 4); \n //decide invA12 (B) location for each page //actually it was saved in invA21 from last kernel //each workgroup loads half of B (up or down) //B = d_dinvA + page*NB*NB + blk*NB + gidy*(blk/2)*NB; B = d_dinvA + page_block*NB*NB + blk*NB + gidy*(blk / 4)*NB; \n //decide invA12 location for each page //each workgroup writes 1/4 of C //C = d_dinvA + page*NB*NB + blk * NB + gidx%2*(blk/2) + gidy*(blk/2)*NB; C = d_dinvA + page_block*NB*NB + blk*NB + gidx % 4 * (blk / 4) + gidy*(blk / 4)*NB; \n //read A and B into LDS no transpose operated here //each work item loads a half row of A and half column of B //idx 0-23 idy 0-1 uint block_k = blk / 48; \n //thus we need 2 iterations here do{\n barrier(CLK_LOCAL_MEM_FENCE); \n lA[idx][0 + idy * 24] = A[idx + idy * 24 * NB]; \n lA[idx][1 + idy * 24] = A[idx + NB + idy * 24 * NB]; \n lA[idx][2 + idy * 24] = A[idx + NB * 2 + idy * 24 * NB];\n lA[idx][3 + idy * 24] = A[idx + NB * 3 + idy * 24 * NB];\n lA[idx][4 + idy * 24] = A[idx + NB * 4 + idy * 24 * NB];\n lA[idx][5 + idy * 24] = A[idx + NB * 5 + idy * 24 * NB];\n lA[idx][6 + idy * 24] = A[idx + NB * 6 + idy * 24 * NB];\n lA[idx][7 + idy * 24] = A[idx + NB * 7 + idy * 24 * NB];\n lA[idx][8 + idy * 24] = A[idx + NB * 8 + idy * 24 * NB];\n lA[idx][9 + idy * 24] = A[idx + NB * 9 + idy * 24 * NB];\n lA[idx][10 + idy * 24] = A[idx + NB * 10 + idy * 24 * NB];\n lA[idx][11 + idy * 24] = A[idx + NB * 11 + idy * 24 * NB];\n lA[idx][12 + idy * 24] = A[idx + NB * 12 + idy * 24 * NB];\n lA[idx][13 + idy * 24] = A[idx + NB * 13 + idy * 24 * NB];\n lA[idx][14 + idy * 24] = A[idx + NB * 14 + idy * 24 * NB];\n lA[idx][15 + idy * 24] = A[idx + NB * 15 + idy * 24 * NB];\n lA[idx][16 + idy * 24] = A[idx + NB * 16 + idy * 24 * NB];\n lA[idx][17 + idy * 24] = A[idx + NB * 17 + idy * 24 * NB];\n lA[idx][18 + idy * 24] = A[idx + NB * 18 + idy * 24 * NB];\n lA[idx][19 + idy * 24] = A[idx + NB * 19 + idy * 24 * NB];\n lA[idx][20 + idy * 24] = A[idx + NB * 20 + idy * 24 * NB];\n lA[idx][21 + idy * 24] = A[idx + NB * 21 + idy * 24 * NB];\n lA[idx][22 + idy * 24] = A[idx + NB * 22 + idy * 24 * NB];\n lA[idx][23 + idy * 24] = A[idx + NB * 23 + idy * 24 * NB];\n lB[0 + idy * 24][idx] = B[idx*NB + idy * 24]; \n lB[1 + idy * 24][idx] = B[idx*NB + idy * 24 + 1]; \n lB[2 + idy * 24][idx] = B[idx*NB + idy * 24 + 2];\n lB[3 + idy * 24][idx] = B[idx*NB + idy * 24 + 3];\n lB[4 + idy * 24][idx] = B[idx*NB + idy * 24 + 4];\n lB[5 + idy * 24][idx] = B[idx*NB + idy * 24 + 5];\n lB[6 + idy * 24][idx] = B[idx*NB + idy * 24 + 6];\n 
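/* Local staging note: the A tile is read row-wise per work-item into lA[row][k], while the
   B tile is read column-wise and stored with its indices swapped as lB[k][col]; the multiply
   loop further down can then stream both tiles along the shared k index. The barrier before
   the loads orders them against the previous blk/48 iteration, and the one after the loads
   orders them against the multiply. */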
lB[7 + idy * 24][idx] = B[idx*NB + idy * 24 + 7];\n lB[8 + idy * 24][idx] = B[idx*NB + idy * 24 + 8];\n lB[9 + idy * 24][idx] = B[idx*NB + idy * 24 + 9];\n lB[10 + idy * 24][idx] = B[idx*NB + idy * 24 + 10];\n lB[11 + idy * 24][idx] = B[idx*NB + idy * 24 + 11];\n lB[12 + idy * 24][idx] = B[idx*NB + idy * 24 + 12];\n lB[13 + idy * 24][idx] = B[idx*NB + idy * 24 + 13];\n lB[14 + idy * 24][idx] = B[idx*NB + idy * 24 + 14];\n lB[15 + idy * 24][idx] = B[idx*NB + idy * 24 + 15];\n lB[16 + idy * 24][idx] = B[idx*NB + idy * 24 + 16];\n lB[17 + idy * 24][idx] = B[idx*NB + idy * 24 + 17];\n lB[18 + idy * 24][idx] = B[idx*NB + idy * 24 + 18];\n lB[19 + idy * 24][idx] = B[idx*NB + idy * 24 + 19];\n lB[20 + idy * 24][idx] = B[idx*NB + idy * 24 + 20];\n lB[21 + idy * 24][idx] = B[idx*NB + idy * 24 + 21];\n lB[22 + idy * 24][idx] = B[idx*NB + idy * 24 + 22];\n lB[23 + idy * 24][idx] = B[idx*NB + idy * 24 + 23];\n barrier(CLK_LOCAL_MEM_FENCE); \n //do math uint i = 0; \n do{\n privateC[0] = mad(lA[idx][i], lB[i][0 + idy * 12], privateC[0]);\n privateC[1] = mad(lA[idx][i], lB[i][1 + idy * 12], privateC[1]);\n privateC[2] = mad(lA[idx][i], lB[i][2 + idy * 12], privateC[2]);\n privateC[3] = mad(lA[idx][i], lB[i][3 + idy * 12], privateC[3]);\n privateC[4] = mad(lA[idx][i], lB[i][4 + idy * 12], privateC[4]);\n privateC[5] = mad(lA[idx][i], lB[i][5 + idy * 12], privateC[5]);\n privateC[6] = mad(lA[idx][i], lB[i][6 + idy * 12], privateC[6]);\n privateC[7] = mad(lA[idx][i], lB[i][7 + idy * 12], privateC[7]);\n privateC[8] = mad(lA[idx][i], lB[i][8 + idy * 12], privateC[8]);\n privateC[9] = mad(lA[idx][i], lB[i][9 + idy * 12], privateC[9]);\n privateC[10] = mad(lA[idx][i], lB[i][10 + idy * 12], privateC[10]); \n privateC[11] = mad(lA[idx][i], lB[i][11 + idy * 12], privateC[11]); \n i = i + 1; \n } while (i < 48); \n A += 48 * NB; \n B += 48; \n } while (--block_k>0); \n uint i = 0; \n do{\n C[NB*idy * 12 + NB*i + idx] = -1 * privateC[i]; \n i = i + 1; \n } while (i < 12); \n }\n // end of kernel ); #endif clblas-2.10/src/library/blas/xasum.c000066400000000000000000000164361264277366700174070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doAsum( CLBlasKargs *kargs, size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; clblasStatus retCode = clblasSuccess; cl_event firstAsumCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; DataType asumType = (kargs->dtype == TYPE_COMPLEX_FLOAT) ? TYPE_FLOAT: ((kargs->dtype == TYPE_COMPLEX_DOUBLE) ? 
TYPE_DOUBLE: kargs->dtype); if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(scratchBuff, asum, X, true, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_ASUM printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ))) { #ifdef DEBUG_ASUM printf("Invalid Size for X\n"); #endif return retCode; } // Minimum size of scratchBuff is N if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET ))) { #ifdef DEBUG_ASUM printf("Insufficient ScratchBuff\n"); #endif return retCode; } if ((retCode = checkVectorSizes(asumType, 1, asum, offAsum, 1, X_VEC_ERRSET ))) { #ifdef DEBUG_ASUM printf("Invalid Size for asum\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = asum; kargs->offA = offAsum; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx <1){ kargs->N = 1; } kargs->D = scratchBuff; kargs->redctnType = REDUCE_BY_SUM; memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); redctnArgs.dtype = asumType; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ASUM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstAsumCall, &seq); if (err == CL_SUCCESS) { /** The second kernel call needs to know the number of work-groups used in the first kernel call. 
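Each work-group of the first (CLBLAS_ASUM) pass reduces its own chunk of X and writes a single
partial sum into scratchBuff, so the epilogue pass only has to add one value per work-group and
store the result at asum[offAsum].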
This number of work-groups is calculated here and passed as N to second reduction kernel **/ err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstAsumCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_ASUM printf("SASUM Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_ASUM; return doAsum(&kargs, N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_ASUM printf("DASUM called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_ASUM; return doAsum(&kargs, N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasScasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_ASUM printf("SCASUM Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_ASUM; kargs.dtype = TYPE_COMPLEX_FLOAT; return doAsum(&kargs, N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDzasum( size_t N, cl_mem asum, size_t offAsum, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_DZASUM printf("DZASUM Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_ASUM; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doAsum(&kargs, N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xaxpy.c000066400000000000000000000134231264277366700174140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_AXPY #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doAxpy( CLBlasKargs *kargs, size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_AXPY printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_AXPY printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_AXPY printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* * ASSUMPTION: * doTRMV assumes "commandQueue" of 0. The same is reflected in * "makeSolutionSeq" as well. If either of them changes in future, * this code needs to be revisited. 
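 *
 * For reference, a minimal single-queue host-side call into this path (via the
 * clblasSaxpy entry point defined below) looks roughly like the following sketch;
 * the context, queue and buffer names are illustrative only and error handling is
 * omitted. clblasSetup() must have been called beforehand.
 *
 *   cl_event event;
 *   cl_mem bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY,  N * sizeof(cl_float), NULL, &err);
 *   cl_mem bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err);
 *   // ... clEnqueueWriteBuffer() the host vectors into bufX and bufY ...
 *   err = clblasSaxpy(N, alpha, bufX, 0, 1, bufY, 0, 1,
 *                     1, &queue, 0, NULL, &event);      // Y = alpha*X + Y
 *   clWaitForEvents(1, &event);
 *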
*/ kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy #ifdef DEBUG_AXPY printf("Calling makeSolutionSeq from DoAxpy: AXPY\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_AXPY, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSaxpy( size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_AXPY printf("\nSAXPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; return doAxpy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDaxpy( size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_AXPY printf("\nDAXPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; return doAxpy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCaxpy( size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_AXPY printf("\nCAXPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; return doAxpy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZaxpy( size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_AXPY printf("\nZAXPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; return doAxpy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xcopy.c000066400000000000000000000125171264277366700174100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_COPY #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doCopy( CLBlasKargs *kargs, size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_COPY printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_COPY printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_COPY printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy #ifdef DEBUG_COPY printf("Calling makeSolutionSeq from DoCopy: COPY\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_COPY, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasScopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_COPY printf("\nSCOPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doCopy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_COPY printf("\nDCOPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doCopy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_COPY printf("\nCCOPY Called\n"); #endif memset(&kargs, 0, 
sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; return doCopy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZcopy( size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_COPY printf("\nZCOPY Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; return doCopy(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xdot.c000066400000000000000000000227431264277366700172260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doDot( CLBlasKargs *kargs, size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, int doConj, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; clblasStatus retCode = clblasSuccess; cl_event firstDotCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); retCode |= checkMemObjects(scratchBuff, dotProduct, X, false, X_VEC_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_DOT printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_DOT printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_DOT printf("Invalid Size for Y\n"); #endif return retCode; } // Minimum size of scratchBuff is N if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET))) { #ifdef DEBUG_DOT printf("Insufficient ScratchBuff\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_DOT printf("Invalid Size for dotProduct\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = dotProduct; kargs->offA = offDP; kargs->offa = offDP; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->D = scratchBuff; kargs->redctnType = REDUCE_BY_SUM; kargs->K = (size_t)doConj; memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); listInitHead(&seq); err = makeSolutionSeq(CLBLAS_DOT, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstDotCall, &seq); if (err == CL_SUCCESS) { /** The second kernel call needs to know the number of work-groups used in the first kernel call. This number of work-groups is calculated here and passed as N to second reduction kernel **/ err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstDotCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("SDOT Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_DOT; doConj = 0; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDdot( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("DDOT called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_DOT; doConj = 0; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCdotu( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("CDOTU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_DOT; kargs.dtype = TYPE_COMPLEX_FLOAT; doConj = 0; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZdotu( size_t N, cl_mem dotProduct, size_t offDP, 
const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("ZDOTU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_DOT; kargs.dtype = TYPE_COMPLEX_DOUBLE; doConj = 0; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("CDOTU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_DOT; kargs.dtype = TYPE_COMPLEX_FLOAT; doConj = 1; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZdotc( size_t N, cl_mem dotProduct, size_t offDP, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_DOT printf("ZDOTU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.pigFuncID = CLBLAS_DOT; kargs.dtype = TYPE_COMPLEX_DOUBLE; doConj = 1; return doDot(&kargs, N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xgbmv.c000066400000000000000000000154511264277366700173710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doGbmv( CLBlasKargs *kargs, clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t sizev; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))) { return retCode; } if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, KL, KU, A, offa, lda, A_MAT_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? N : M; if ((retCode = checkVectorSizes(kargs->dtype, sizev, x, offx, incx, X_VEC_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? M : N; if ((retCode = checkVectorSizes(kargs->dtype, sizev, y, offy, incy, Y_VEC_ERRSET ))) { return retCode; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; kargs->order = order; kargs->transA = transA; kargs->M = M; kargs->N = N; kargs->KL = KL; kargs->KU = KU; kargs->A = A; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_GBMV; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doGbmv(&kargs, order, transA, M, N, KL, KU, A, offa, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_GBMV; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doGbmv(&kargs, order, transA, M, N, KL, KU, A, offa, lda, x, offx, incx, y, 
offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_float2 beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_GBMV; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doGbmv(&kargs, order, transA, M, N, KL, KU, A, offa, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_double2 beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_GBMV; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doGbmv(&kargs, order, transA, M, N, KL, KU, A, offa, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xgemm.cc000066400000000000000000000625111264277366700175250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include "AutoGemmIncludes/AutoGemmKernelSelection.h" #include "GemmSpecialCases.h" #include // #include #include "xgemm.h" /****************************************************************************** * Row major -> column major *****************************************************************************/ static void force_gemm_column_major( clblasOrder &order, clblasTranspose &transA, clblasTranspose &transB, cl_uint &M, cl_uint &N, cl_uint &offA, cl_uint &offB, cl_uint &lda, cl_uint &ldb, cl_mem &A, cl_mem &B ) { if (order == clblasRowMajor) { std::swap(transA , transB); std::swap(M , N); std::swap(offA , offB); std::swap(lda , ldb); std::swap(A , B); order = clblasColumnMajor; } } /****************************************************************************** * Check OpenCL Errors *****************************************************************************/ #define CL_CHECK(RET) \ if(RET != CL_SUCCESS) { \ printf("OpenCL error %i on line %u of %s\n", RET, __LINE__, __FILE__); \ assert(false); \ } const static unsigned int numGemmKernelArgs = 14; void *gemmKernelArgs[numGemmKernelArgs]; size_t gemmKernelArgSizes[numGemmKernelArgs]; /****************************************************************************** * Is beta zero for optimization *****************************************************************************/ template bool isZero(Precision value); template<> bool isZero( float value ) { return value == 0; }; template<> bool isZero( double value ) { return value == 0; }; template<> bool isZero( FloatComplex value ) { return CREAL(value) == 0 && CIMAG(value) == 0; }; template<> bool isZero( DoubleComplex value ) { return CREAL(value) == 0 && CIMAG(value) == 0; }; static char *getKernelName(cl_kernel clKernel) { cl_int err; // get kernel name size_t kernelNameLength; err = clGetKernelInfo( clKernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernelNameLength), NULL, &kernelNameLength); // Do not check this error because of an nvidia bug. // The kernelNameLength turns out to be of proper length. // CL_CHECK(err) char *kernelName = new char[kernelNameLength]; err = clGetKernelInfo( clKernel, CL_KERNEL_FUNCTION_NAME, kernelNameLength*sizeof(char), kernelName, NULL ); CL_CHECK(err) return kernelName; } /****************************************************************************** * Make Gemm Kernel *****************************************************************************/ //FIXME: This function should be returning an error. 
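/*
 * Kernel caching (a sketch of the behaviour implemented in makeGemmKernel
 * below, not an addition to it): built kernels are memoised in a
 * function-local static std::map keyed by "<device>_<context>_<kernelName>",
 * so a kernel is compiled at most once per device/context pair and later
 * calls reuse the cached cl_kernel. Illustrative key construction, using the
 * variables that appear in the function body:
 *
 *   std::stringstream ss;
 *   ss << clDevice << "_" << clContext;            // identifies the target
 *   std::string key = ss.str() + "_" + kernelName;
 *   kernel_map[key] = *clKernel;                   // retrieved later via
 *                                                  // kernel_map.find(key)
 *
 * As the TODO inside the function notes, the static map is shared and not yet
 * guarded, so this cache is not thread safe.
 */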
void makeGemmKernel( cl_kernel *clKernel, cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, size_t *kernelBinarySize, const char *binaryBuildOptions) { //TODO: This will need to be converted to thread local when making clBLAS thread safe typedef std::map kernel_map_t; static kernel_map_t kernel_map; cl_context clContext; cl_device_id clDevice; cl_int err; err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL); CL_CHECK(err) err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); CL_CHECK(err) std::stringstream ss; ss << clDevice << "_" << clContext; std::string prefix = ss.str(); if (*clKernel) { char *kernelName = getKernelName(*clKernel); // kernel has already been built, return #ifdef AUTOGEMM_PRINT_DEBUG printf("makeGemmKernel: \"%s\" already built; returning.\n", kernelName); #endif // Check if kernel exists for this device std::string key = prefix + "_" + kernelName; kernel_map_t::iterator idx = kernel_map.find(key); // If kernel not found for this device, set to NULL if (idx == kernel_map.end()) { *clKernel = NULL; } else { *clKernel = idx->second; } delete[] kernelName; } if (!*clKernel) { // kernel has not been built, so build it (from binary, preferably) cl_program clProgram; cl_int clBinaryStatus; if (*kernelBinary) { #ifdef AUTOGEMM_PRINT_DEBUG printf("makeGemmKernel: pre-compiled binary found: %llu bytes\n", *kernelBinarySize); printf("makeGemmKernel: Creating program from binary\n"); #endif clProgram = clCreateProgramWithBinary( clContext, 1, &clDevice, kernelBinarySize, kernelBinary, &clBinaryStatus, &err ); #ifdef AUTOGEMM_PRINT_DEBUG if (err != CL_SUCCESS) { printf("makeGemmKernel: Failed to create program with binary\n"); } #endif err = clBuildProgram( clProgram, 1, &clDevice, binaryBuildOptions, NULL, NULL ); #ifdef AUTOGEMM_PRINT_DEBUG if (err != CL_SUCCESS) { printf("makeGemmKernel: Failed to build program from binary\n"); } #endif } if (!*kernelBinary || err != CL_SUCCESS) { #ifdef AUTOGEMM_PRINT_DEBUG printf("makeGemmKernel: Creating program from source\n"); #endif clProgram = clCreateProgramWithSource( clContext, 1, &kernelSource, NULL, &err ); CL_CHECK(err) err = clBuildProgram( clProgram, 1, &clDevice, sourceBuildOptions, NULL, NULL ); CL_CHECK(err) } // print build failure if (err != CL_SUCCESS) { printf("clBuildProgram Failed\n"); printf("err = %d\n", err); size_t len = 0; clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); char* buildLog = new char[len]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0); printf("\nBuild Log:\n\n"); printf("%s\n", buildLog); //printf("\n\nKernel String:\n\n"); //printf("%s\n", kernelSource); //FIXME: The function should be exiting at this point } err = clCreateKernelsInProgram( clProgram, 1, clKernel, NULL ); CL_CHECK(err) err = clReleaseProgram(clProgram); CL_CHECK(err) char *kernelName = getKernelName(*clKernel); #ifdef AUTOGEMM_PRINT_DEBUG printf("makeGemmKernel: \"%s\" now built; returning.\n", kernelName); #endif std::string key = prefix + "_" + kernelName; kernel_map[key] = *clKernel; delete[] kernelName; } return; } /****************************************************************************** * Enqueue Gemm Kernel *****************************************************************************/ void enqueueGemmKernel( cl_command_queue clQueue, cl_kernel 
clKernel, void **kernelArgs, size_t *kernelArgSizes, unsigned int numKernelArgs, const size_t *globalWorkSize, const size_t *localWorkSize, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *clEvent) { for (unsigned int i = 0; i < numKernelArgs; i++) { CL_CHECK( clSetKernelArg( clKernel, i, kernelArgSizes[i], kernelArgs[i]) ) } /*printf("global={%llu, %llu} local={%llu, %llu}\n", globalWorkSize[0], globalWorkSize[1], localWorkSize[0], localWorkSize[1] );*/ CL_CHECK( clEnqueueNDRangeKernel( clQueue, clKernel, 2, NULL, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, clEvent ) ) } /****************************************************************************** * get precision string *****************************************************************************/ template char * getPrecision(); template<> char * getPrecision() { return "s"; } template<> char * getPrecision() { return "d"; } template<> char * getPrecision() { return "c"; } template<> char * getPrecision() { return "z"; } /****************************************************************************** * convert ConjTrans -> Trans for real *****************************************************************************/ template clblasTranspose correctTranspose(clblasTranspose trans); template<> clblasTranspose correctTranspose( clblasTranspose trans) { return (trans==clblasConjTrans) ? clblasTrans : trans; } template<> clblasTranspose correctTranspose( clblasTranspose trans) { return (trans==clblasConjTrans) ? clblasTrans : trans; } template<> clblasTranspose correctTranspose( clblasTranspose trans) { return trans; } template<> clblasTranspose correctTranspose( clblasTranspose trans) { return trans; } /****************************************************************************** * templated Gemm *****************************************************************************/ template clblasStatus clblasGemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t iM, size_t iN, size_t iK, Precision alpha, const cl_mem iA, size_t iOffA, size_t iLda, const cl_mem iB, size_t iOffB, size_t iLdb, Precision beta, cl_mem C, size_t iOffC, size_t iLdc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { // cast types to opencl types cl_mem A = iA; cl_mem B = iB; cl_uint M = static_cast( iM ); cl_uint N = static_cast( iN ); cl_uint K = static_cast( iK ); cl_uint offA = static_cast( iOffA ); cl_uint offB = static_cast( iOffB ); cl_uint offC = static_cast( iOffC ); cl_uint lda = static_cast( iLda ); cl_uint ldb = static_cast( iLdb ); cl_uint ldc = static_cast( iLdc ); transA = correctTranspose(transA); transB = correctTranspose(transB); // if debug build, validate input // CHECK_QUEUES(numCommandQueues, commandQueues); // CHECK_EVENTS(numEventsInWaitList, eventWaitList); // CHECK_MATRIX_A(Precision, order, transA, A, M, K, offA, lda); // CHECK_MATRIX_B(Precision, order, transB, B, K, N, offB, ldb); // CHECK_MATRIX_C(Precision, order, clblasNoTrans, C, M, N, offC, ldc); force_gemm_column_major( order, transA, transB, M, N, offA, offB, lda, ldb, A, B ); /****************************************************************************** * Handle Special Cases * * 1) sgemm NT where lda, ldb are big multiples of 1024 starting from 4096 * * 2) sgemm NT where M and N are within middle range * and are mod32 but not mod96 or mod64 * *****************************************************************************/ bool 
specialCaseHandled = false; clblasStatus SpecialCaseStatus = GemmSpecialCases(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return SpecialCaseStatus; /****************************************************************************** * Optimal num elements per thread *****************************************************************************/ cl_int err; cl_device_id clDevice; err = clGetCommandQueueInfo( commandQueues[0], CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); CL_CHECK(err) cl_uint clDeviceNumCUs; err = clGetDeviceInfo( clDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(clDeviceNumCUs), &clDeviceNumCUs, NULL); CL_CHECK(err) unsigned int deviceIdealNumThreads = (8 /*waves per CU*/)*(64 /*threads per wave*/)*clDeviceNumCUs; float optimalNumElementsPerThread = ((float)M*N) / deviceIdealNumThreads; //optimalNumElementsPerThread = 32; bool betaNonZero = !isZero(beta); #ifdef AUTOGEMM_PRINT_DEBUG printf("%sgemm_%3s_%s%s_B%u_%llux%llux%llu\n", getPrecision(), order==clblasColumnMajor ? "Col" : "Row", transA==clblasNoTrans ? "N" : transA==clblasTrans ? "T" : "C", transB==clblasNoTrans ? "N" : transB==clblasTrans ? "T" : "C", betaNonZero ? 1 : 0, iM, iN, iK ); #endif /****************************************************************************** * Select kernel *****************************************************************************/ const char *tileKernelSource = NULL; const char *rowKernelSource = NULL; const char *colKernelSource = NULL; const char *cornerKernelSource = NULL; const char *sourceBuildOptions = NULL; const unsigned char *tileKernelBinary = NULL; const unsigned char *rowKernelBinary = NULL; const unsigned char *colKernelBinary = NULL; const unsigned char *cornerKernelBinary = NULL; size_t *tileKernelBinarySize = 0; size_t *rowKernelBinarySize = 0; size_t *colKernelBinarySize = 0; size_t *cornerKernelBinarySize = 0; const char *binaryBuildOptions = NULL; cl_kernel *tileClKernel = NULL; cl_kernel *rowClKernel = NULL; cl_kernel *colClKernel = NULL; cl_kernel *cornerClKernel = NULL; unsigned int workGroupNumRows; unsigned int workGroupNumCols; unsigned int microTileNumRows; unsigned int microTileNumCols; unsigned int unroll; gemmSelectKernel( order, transA, transB, iM, iN, iK, betaNonZero, optimalNumElementsPerThread, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, &tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols, &unroll); // make sure gemmSelectKernel found a valid kernel if (!tileKernelSource) { printf("ERROR: gemmSelectKernel() couldn't find kernel(s) for { order=%s, transA=%s, transB=%s, M=%llu, N=%llu, K=%llu, beta=%u, onept=%f }\n", order==clblasColumnMajor ? "ColMajor" : "RowMajor", transA==clblasNoTrans ? "N" : transA==clblasTrans ? "T" : "C", transB==clblasNoTrans ? "N" : transB==clblasTrans ? "T" : "C", M, N, K, betaNonZero ? 
1 : 0, optimalNumElementsPerThread ); gemmSelectKernel( order, transA, transB, M, N, K, betaNonZero, optimalNumElementsPerThread, &tileKernelSource, &rowKernelSource, &colKernelSource, &cornerKernelSource, &sourceBuildOptions, &tileKernelBinary, &rowKernelBinary, &colKernelBinary, &cornerKernelBinary, &tileKernelBinarySize, &rowKernelBinarySize, &colKernelBinarySize, &cornerKernelBinarySize, &binaryBuildOptions, &tileClKernel, &rowClKernel, &colClKernel, &cornerClKernel, &workGroupNumRows, &workGroupNumCols, µTileNumRows, µTileNumCols, &unroll); return clblasNotImplemented; } unsigned int macroTileNumRows = workGroupNumRows*microTileNumRows; unsigned int macroTileNumCols = workGroupNumCols*microTileNumCols; bool needTileKernel = M/macroTileNumRows > 0 && N/macroTileNumCols > 0; bool needRowKernel = M%macroTileNumRows > 0 && N/macroTileNumCols > 0; bool needColKernel = N%macroTileNumCols > 0 && M/macroTileNumRows > 0; bool needCornerKernel = M%macroTileNumRows > 0 && N%macroTileNumCols > 0; #if 0 printf("For M,N,K = %u,%u,%u and %u CUs selected tile is wg=%ux%u, microTile=%ux%u, macroTile=%ux%u kernelsNeeded=%u,%u,%u,%u\n", M, N, K, clDeviceNumCUs, workGroupNumRows, workGroupNumCols, microTileNumRows, microTileNumCols, macroTileNumRows, macroTileNumCols, needTileKernel ? 1 : 0, needRowKernel ? 1 : 0, needColKernel ? 1 : 0, needCornerKernel ? 1 : 0 ); #endif /****************************************************************************** * Build kernels *****************************************************************************/ if (needTileKernel) makeGemmKernel( tileClKernel, commandQueues[0], tileKernelSource, sourceBuildOptions, &tileKernelBinary, tileKernelBinarySize, binaryBuildOptions); if (needRowKernel) makeGemmKernel( rowClKernel, commandQueues[0], rowKernelSource, sourceBuildOptions, &rowKernelBinary, rowKernelBinarySize, binaryBuildOptions); if (needColKernel) makeGemmKernel( colClKernel, commandQueues[0], colKernelSource, sourceBuildOptions, &colKernelBinary, colKernelBinarySize, binaryBuildOptions); if (needCornerKernel) makeGemmKernel(cornerClKernel, commandQueues[0], cornerKernelSource, sourceBuildOptions, &cornerKernelBinary, cornerKernelBinarySize, binaryBuildOptions); const size_t localWorkSize[2] = { workGroupNumRows, workGroupNumCols }; unsigned int numKernelsEnqueued = 0; /****************************************************************************** * Gather kernel arguments *****************************************************************************/ gemmKernelArgs[ 0] = &A; gemmKernelArgSizes[ 0] = sizeof(cl_mem); gemmKernelArgs[ 1] = &B; gemmKernelArgSizes[ 1] = sizeof(cl_mem); gemmKernelArgs[ 2] = &C; gemmKernelArgSizes[ 2] = sizeof(cl_mem); gemmKernelArgs[ 3] = α gemmKernelArgSizes[ 3] = sizeof(Precision); gemmKernelArgs[ 4] = β gemmKernelArgSizes[ 4] = sizeof(Precision); gemmKernelArgs[ 5] = &M; gemmKernelArgSizes[ 5] = sizeof(cl_uint); gemmKernelArgs[ 6] = &N; gemmKernelArgSizes[ 6] = sizeof(cl_uint); gemmKernelArgs[ 7] = &K; gemmKernelArgSizes[ 7] = sizeof(cl_uint); gemmKernelArgs[ 8] = &lda; gemmKernelArgSizes[ 8] = sizeof(cl_uint); gemmKernelArgs[ 9] = &ldb; gemmKernelArgSizes[ 9] = sizeof(cl_uint); gemmKernelArgs[10] = &ldc; gemmKernelArgSizes[10] = sizeof(cl_uint); gemmKernelArgs[11] = &offA; gemmKernelArgSizes[11] = sizeof(cl_uint); gemmKernelArgs[12] = &offB; gemmKernelArgSizes[12] = sizeof(cl_uint); gemmKernelArgs[13] = &offC; gemmKernelArgSizes[13] = sizeof(cl_uint); /****************************************************************************** * 
Enqueue Tile kernel *****************************************************************************/ if (needTileKernel) { //printf("enqueueing tile kernel\n"); size_t globalWorkSize[2] = {(M/macroTileNumRows)*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *tileClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); numKernelsEnqueued++; } /****************************************************************************** * Enqueue Row kernel *****************************************************************************/ if (needRowKernel) { //printf("enqueueing row kernel\n"); size_t globalWorkSize[2] = {1*workGroupNumRows, (N/macroTileNumCols)*workGroupNumCols }; enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *rowClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); numKernelsEnqueued++; } /****************************************************************************** * Enqueue Col kernel *****************************************************************************/ if (needColKernel) { //printf("enqueueing col kernel\n"); size_t globalWorkSize[2] = { (M/macroTileNumRows)*workGroupNumRows, 1*workGroupNumCols }; enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *colClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); numKernelsEnqueued++; } /****************************************************************************** * Enqueue Corner kernel *****************************************************************************/ if (needCornerKernel) { //printf("enqueueing corner kernel\n"); size_t globalWorkSize[2] = { 1*workGroupNumRows, 1*workGroupNumCols }; enqueueGemmKernel( commandQueues[numKernelsEnqueued%numCommandQueues], *cornerClKernel, gemmKernelArgs, gemmKernelArgSizes, numGemmKernelArgs, globalWorkSize, localWorkSize, numEventsInWaitList, eventWaitList, &events[numKernelsEnqueued%numCommandQueues] ); numKernelsEnqueued++; } return clblasSuccess; } /****************************************************************************** * SGEMM API call *****************************************************************************/ extern "C" clblasStatus clblasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasGemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } /****************************************************************************** * DGEMM API call *****************************************************************************/ extern "C" clblasStatus clblasDgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, 
size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasGemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } /****************************************************************************** * CGEMM API call *****************************************************************************/ extern "C" clblasStatus clblasCgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasGemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } /****************************************************************************** * ZGEMM API *****************************************************************************/ extern "C" clblasStatus clblasZgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasGemm( order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xgemm2.c000066400000000000000000000321641264277366700174450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" //#define DEBUG_GEMM_2 int gemmHasMTail(size_t M, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB) { transB = transB; // Dummy- to remove warning if (order == clblasColumnMajor) { if (transA == clblasNoTrans) { return (M % vecLen); } else { return 0; } } else { printf("gemmHasMTail: Not handling Row Major - FIXME\n"); return 0; } } int gemmHasNTail(size_t N, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB) { if (order == clblasColumnMajor) { if (transA == clblasNoTrans) { if (transB == clblasNoTrans) { return 0; } else { return (N % vecLen); } } else { if (transB == clblasNoTrans) { return 0; } else { return (N % vecLen); } } } else { printf("gemmHasNTail: Not handling Row Major - FIXME\n"); return 0; } } int gemmHasTails(size_t M, size_t N, size_t K, int vecLen, clblasOrder order, clblasTranspose transA, clblasTranspose transB) { K = K; // Dummy- to remove warning if (order == clblasColumnMajor) { if (transA == clblasNoTrans) { if (transB == clblasNoTrans) { return (M % vecLen); } else { return ((M % vecLen) || (N % vecLen)); } } else { if (transB == clblasNoTrans) { // // Vectoring on A is on K dimension and we handle tail directly in the kernel // return 0; } else { return (N % vecLen); } } } else { printf("gemmHasTails: Not handling Row Major - FIXME\n"); return 0; } } clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err = CL_SUCCESS; ListHead seq, tailSeq; cl_event nontail; cl_uint gemmVeclen; CLBLASKernExtra *kextra; size_t M, N, K; M = kargs->M; N = kargs->N; K = kargs->K; #ifdef DEBUG_GEMM_2 printf("executeGEMM Called\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMM2, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &nontail, &seq); if (err == CL_SUCCESS) { ListNode *f = listNodeFirst(&seq); SolutionStep *gemm2; size_t tailStartM, tailStartN; bool processTails; gemm2 = container_of(f, node, SolutionStep); kextra = gemm2->kernels[CLBLAS_COMPUTING_KERNEL]->extra; gemmVeclen = kextra->vecLen; if (gemmHasTails(M, N, K, gemmVeclen, kargs->order, kargs->transA, kargs->transB) == 0) { #ifdef DEBUG_GEMM_2 printf("No M or N Tails to process..\n"); #endif processTails = false; gemm2->event = events; } else { processTails = true; if (gemmHasMTail(M, gemmVeclen, kargs->order, kargs->transA, kargs->transB)) { tailStartM = M - (M%gemmVeclen); } else { tailStartM = M; } if (gemmHasNTail(N, gemmVeclen, kargs->order, kargs->transA, kargs->transB)) { tailStartN = N - (N%gemmVeclen); } else { tailStartN = N; } } err = executeSolutionSeq(&seq); if ((err == CL_SUCCESS) && (processTails == true)) { CLBlasKargs targs; memcpy(&targs, &gemm2->args, sizeof(CLBlasKargs)); targs.tailStartM = tailStartM; targs.tailStartN = tailStartN; #ifdef DEBUG_GEMM_2 printf("Processing Tails\n"); #endif listInitHead(&tailSeq); err = makeSolutionSeq(CLBLAS_GEMM_TAIL, &targs, numCommandQueues, commandQueues, 1, &nontail, events, &tailSeq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&tailSeq); } freeSolutionSeq(&tailSeq); } } freeSolutionSeq(&seq); return (clblasStatus) err; } static clblasStatus doGemm( CLBlasKargs *kargs, clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, 
size_t K, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) { return retCode; } if (K != 0) { if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET))) { return retCode; } } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET))) { return retCode; } numCommandQueues = 1; #ifdef DEBUG_2 printf("DoGemm being called...\n"); #endif kargs->pigFuncID = CLBLAS_GEMM2; kargs->order = order; kargs->transA = transA; kargs->transB = transB; kargs->M = M; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; kargs->C = C; kargs->offCY = offC; kargs->ldc.matrix = ldc; kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; err = executeGEMM(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return err; } /* clblasStatus clblasSgemmV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_float beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDgemmV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, cl_double beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCgemmV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, FloatComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb, C, 0, ldc, numCommandQueues, commandQueues, 
numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgemmV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t lda, const cl_mem B, size_t ldb, DoubleComplex beta, cl_mem C, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, 0, lda, B, 0, ldb, C, 0, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasSgemmExV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDgemmExV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCgemmExV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgemmExV2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = 
beta; return doGemm(&kargs, order, transA, transB, M, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } */ clblas-2.10/src/library/blas/xgemv.c000066400000000000000000000141631264277366700173730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doGemv( CLBlasKargs *kargs, clblasOrder order, clblasTranspose transA, size_t M, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t sizev; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects( A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? N : M; if ((retCode = checkVectorSizes(kargs->dtype, sizev, x, offx, incx, X_VEC_ERRSET ))) { return retCode; } sizev = (transA == clblasNoTrans) ? M : N; if ((retCode = checkVectorSizes(kargs->dtype, sizev, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } kargs->order = order; kargs->transA = transA; kargs->M = M; kargs->N = N; /* * store original height of the matrix A * FIXME: store it to a dedicated field */ kargs->K = (transA == clblasNoTrans) ? 
M : N; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doGemv(&kargs, order, transA, M, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doGemv(&kargs, order, transA, M, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doGemv(&kargs, order, transA, M, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doGemv(&kargs, order, transA, M, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xger.c000066400000000000000000000207261264277366700172140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_GER #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doGer( CLBlasKargs *kargs, clblasOrder order, size_t M, size_t N, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, int doConj, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for A %d\n",retCode ); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_GER printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* * ASSUMPTION: * doTRMV assumes "commandQueue" of 0. The same is reflected in * "makeSolutionSeq" as well. If either of them changes in future, * this code needs to be revisited. 
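 *
 * Field reuse below (as the inline comments note): the vector increments
 * incx and incy travel in ldb.vector and ldc.vector, and K carries the
 * doConj flag.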
*/ kargs->order = order; kargs->M = M; kargs->N = N; kargs->A = A; kargs->offa = offa; kargs->offA = offa; kargs->lda.matrix = lda; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; kargs->K = (size_t)doConj; // Will be using K as doConj parameter #ifdef DEBUG_GER printf("Calling makeSolutionSeq from DoGer: GER\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GER, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSger( clblasOrder order, size_t M, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nSGER Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; doConj = 0; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDger( clblasOrder order, size_t M, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nDGER Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; doConj = 0; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCgeru( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nCGERU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; doConj = 0; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgeru( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nZGERU Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; doConj = 0; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, 
events); } clblasStatus clblasCgerc( clblasOrder order, size_t M, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nCGERC Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; doConj = 1; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZgerc( clblasOrder order, size_t M, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; int doConj; #ifdef DEBUG_GER printf("\nZGERC Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; doConj = 1; return doGer(&kargs, order, M, N, X, offx, incx, Y, offy, incy, A, offa, lda, doConj, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xhemm.c000066400000000000000000000062171264277366700173640ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
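// Minimal sketch with assumed names, not library code: the GER-family wrappers
// above differ only in the scalar type stored into kargs and in doConj, which
// is 0 for sger/dger/geru and 1 for gerc (A updated with x*y**H rather than
// x*y**T). Assumes an initialized library, a valid queue, and buffers holding
// x (M elements), y (N elements) and A (lda x N, column-major).
static clblasStatus example_sger(cl_command_queue queue,
                                 cl_mem bufX, cl_mem bufY, cl_mem bufA,
                                 size_t M, size_t N, size_t lda)
{
    cl_event done = NULL;
    clblasStatus st = clblasSger(clblasColumnMajor, M, N, 2.0f,
                                 bufX, 0, 1,
                                 bufY, 0, 1,
                                 bufA, 0, lda,
                                 1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}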
* ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" extern clblasStatus doSymm( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side, size_t M, size_t N, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, BlasFunctionID symm_or_hemm); clblasStatus clblasChemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_HEMM printf("Chemm called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_HEMM); } clblasStatus clblasZhemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_HEMM printf("Zhemm called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_HEMM); } clblas-2.10/src/library/blas/xhemv.c000066400000000000000000000115761264277366700174010ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
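// Hedged sketch (illustrative names): clblasChemm computes C := alpha*A*B +
// beta*C with the Hermitian matrix A applied from the chosen side and read
// from the uplo triangle. Assumes bufA is M x M (clblasLeft case) and bufB,
// bufC are M x N, all column-major; cl_float2 components are set through the
// .s array of the OpenCL vector type.
static clblasStatus example_chemm(cl_command_queue queue,
                                  cl_mem bufA, cl_mem bufB, cl_mem bufC,
                                  size_t M, size_t N)
{
    cl_float2 alpha, beta;
    alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;  // alpha = 1 + 0i
    beta.s[0]  = 0.0f; beta.s[1]  = 0.0f;  // beta  = 0
    cl_event done = NULL;
    clblasStatus st = clblasChemm(clblasColumnMajor, clblasLeft, clblasUpper,
                                  M, N, alpha,
                                  bufA, 0, M,
                                  bufB, 0, M,
                                  beta,
                                  bufC, 0, M,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}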
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doHemv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq1, seq2; cl_event first_event; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; listInitHead(&seq1); err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq1); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq1); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasConjTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq1); return (clblasStatus)err; //printf("doHemv called\n"); //return 0; } clblasStatus clblasChemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, FloatComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doHemv(&kargs, order, uplo, N, A, offa, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZhemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, DoubleComplex beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doHemv(&kargs, order, uplo, N, A, offa, lda, x, offx, incx, y, offy, incy, numCommandQueues, 
commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xher.c000066400000000000000000000141021264277366700172040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DO_HER #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doher( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } #ifdef DEBUG_HER printf("doher called\n"); #endif /* Validate arguments */ if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) { #ifdef DEBUG_HER printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_HER printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_HER printf("Invalid Size for X\n"); #endif return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->order = order; if(order == clblasRowMajor) { kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper; } else { kargs->uplo = uplo; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_HER printf("Calling makeSolutionSeq : HER\n"); #endif /* * Always use commandQueues (0) * PENDING: * 1. No Multi-GPU / Multi-command queue support * 2. 
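// Sketch of a host-side call to clblasChemv (assumed names). As doHemv above
// shows, the update is issued as two chained internal passes, clblasNoTrans
// followed by clblasConjTrans linked through first_event, so the caller's
// output event corresponds to the second pass. FloatComplex is assumed to be
// the cl_float2 typedef from the public header.
static clblasStatus example_chemv(cl_command_queue queue,
                                  cl_mem bufA, cl_mem bufX, cl_mem bufY,
                                  size_t N, size_t lda)
{
    FloatComplex alpha, beta;
    alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;
    beta.s[0]  = 0.0f; beta.s[1]  = 0.0f;
    cl_event done = NULL;
    clblasStatus st = clblasChemv(clblasColumnMajor, clblasLower, N,
                                  alpha, bufA, 0, lda,
                                  bufX, 0, 1,
                                  beta, bufY, 0, 1,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}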
This can be optimized to use the commandQ with the higher * memmory bandwidth that supports the data-type and the LDA */ numCommandQueues = 1; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_HER, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasCher( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_HER; #ifdef DEBUG_HER printf("CHER called\n"); #endif return doher(&kargs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZher( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_HER; #ifdef DEBUG_HER printf("ZHER called\n"); #endif return doher(&kargs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasChpr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_HPR; return doher(&kargs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZhpr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_HPR; return doher(&kargs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xher2.c000066400000000000000000000157251264277366700173020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
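// Sketch only (assumed names): for the Hermitian rank-1 routines above, alpha
// is a real scalar (cl_float/cl_double), and the packed variants
// clblasChpr/clblasZhpr reuse doher with lda = 0 to mark packed storage of AP.
// Assumes bufA holds an N x N complex-float matrix and bufX holds N elements.
static clblasStatus example_cher(cl_command_queue queue,
                                 cl_mem bufX, cl_mem bufA,
                                 size_t N, size_t lda)
{
    cl_event done = NULL;
    clblasStatus st = clblasCher(clblasColumnMajor, clblasUpper, N,
                                 1.5f,            // real alpha
                                 bufX, 0, 1,
                                 bufA, 0, lda,
                                 1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}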
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_HER2 #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doHer2( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } #ifdef DEBUG_HER2 printf("doHer2 called\n"); #endif /* Validate arguments */ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_HER2 printf("Invalid Size for Y\n"); #endif return retCode; } if ((commandQueue == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->order = order; if(order == clblasRowMajor) // Handling row-major. Invert X, Y and uplo { kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper; kargs->B = Y; kargs->ldb.vector = incy; kargs->offBX = offy; kargs->C = X; kargs->ldc.vector = incx; kargs->offCY = offx; } else { kargs->uplo = uplo; kargs->B = X; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->C = Y; kargs->ldc.vector = incy; kargs->offCY = offy; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_HER2 printf("Calling makeSolutionSeq : HER2\n"); #endif /* * Always use CommandQueue (0) * PENDING: * 1. No Multi-GPU / Multi-command queue support * 2. 
This can be optimized to use the commandQ with the higher * memmory bandwidth that supports the data-type and the LDA */ numCommandQueues = 1; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_HER2, kargs, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasCher2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.pigFuncID = CLBLAS_HER2; #ifdef DEBUG_HER2 printf("Cher2 called\n"); #endif return doHer2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZher2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.pigFuncID = CLBLAS_HER2; #ifdef DEBUG_HER2 printf("Zher2 called\n"); #endif return doHer2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasChpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.pigFuncID = CLBLAS_HPR2; return doHer2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZhpr2( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.pigFuncID = CLBLAS_HPR2; return doHer2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xher2k.c000066400000000000000000000147561264277366700174600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
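// Sketch (assumed names): clblasCher2 performs the Hermitian rank-2 update
// A := alpha*x*y**H + conj(alpha)*y*x**H + A. As doHer2 above shows, a
// row-major call is folded onto the column-major path by swapping X/Y and
// flipping uplo before the solution sequence is built.
static clblasStatus example_cher2(cl_command_queue queue,
                                  cl_mem bufX, cl_mem bufY, cl_mem bufA,
                                  size_t N, size_t lda)
{
    cl_float2 alpha;
    alpha.s[0] = 1.0f; alpha.s[1] = -1.0f;  // alpha = 1 - i
    cl_event done = NULL;
    clblasStatus st = clblasCher2(clblasColumnMajor, clblasUpper, N, alpha,
                                  bufX, 0, 1,
                                  bufY, 0, 1,
                                  bufA, 0, lda,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}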
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" //#define DEBUG_HER2K extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); clblasStatus doHer2k( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; clblasUplo fUplo; clblasTranspose fTransA; cl_event firstHerkCall; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if (numCommandQueues == 0 || commandQueues == NULL) { return clblasInvalidValue; } numCommandQueues = 1; if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } // Validate arguments if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) { return retCode; } if (transA == clblasTrans) { return clblasInvalidValue; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET))) { return retCode; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo; fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA; kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order; kargs->transA = fTransA; kargs->transB = (fTransA == clblasNoTrans) ? 
clblasConjTrans : clblasNoTrans; kargs->uplo = fUplo; kargs->M = N; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offb; kargs->ldb.matrix = ldb; kargs->C = C; kargs->offCY = offc; kargs->ldc.matrix = ldc; kargs->pigFuncID = CLBLAS_HERK; err = executeGEMM(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstHerkCall); if( err == CL_SUCCESS ) { kargs->A = B; kargs->offA = offb; kargs->offa = offb; kargs->lda.matrix = ldb; kargs->B = A; kargs->offBX = offa; kargs->ldb.matrix = lda; if( kargs->dtype == TYPE_COMPLEX_FLOAT ) { CIMAG( kargs->alpha.argFloatComplex ) *= -1.0; CREAL( kargs->beta.argFloatComplex ) = 1.0; CIMAG( kargs->beta.argFloatComplex ) = 0.0; } else { CIMAG( kargs->alpha.argDoubleComplex ) *= -1.0; CREAL( kargs->beta.argDoubleComplex ) = 1.0; CIMAG( kargs->beta.argDoubleComplex ) = 0.0; } err = executeGEMM(kargs, numCommandQueues, commandQueues, 1, &firstHerkCall, events); } return (clblasStatus)err; } clblasStatus clblasCher2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; FloatComplex fBeta; memset(&kargs, 0, sizeof(kargs)); CREAL(fBeta) = beta; CIMAG(fBeta) = 0.0f; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = fBeta; kargs.dtype = TYPE_COMPLEX_FLOAT; if( order == clblasRowMajor ) { CIMAG( kargs.alpha.argFloatComplex ) *= -1.0; } return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZher2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex fBeta; memset(&kargs, 0, sizeof(kargs)); CREAL(fBeta) = beta; CIMAG(fBeta) = 0.0f; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = fBeta; kargs.dtype = TYPE_COMPLEX_DOUBLE; if( order == clblasRowMajor ) { CIMAG( kargs.alpha.argDoubleComplex ) *= -1.0; } return doHer2k(&kargs, order, uplo, trans, N, K, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xherk.c000066400000000000000000000131521264277366700173630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
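// Note and sketch (assumed names): doHer2k above maps
// C := alpha*A*B**H + conj(alpha)*B*A**H + beta*C onto two back-to-back
// GEMM-style passes; the second pass swaps A and B, negates the imaginary part
// of alpha, and forces beta to 1 so the first pass's result is accumulated,
// with the two passes chained through firstHerkCall. beta itself is a real
// scalar, and trans == clblasTrans is rejected for the Hermitian case.
static clblasStatus example_cher2k(cl_command_queue queue,
                                   cl_mem bufA, cl_mem bufB, cl_mem bufC,
                                   size_t N, size_t K,
                                   size_t lda, size_t ldb, size_t ldc)
{
    FloatComplex alpha;
    alpha.s[0] = 1.0f; alpha.s[1] = 0.0f;
    cl_event done = NULL;
    clblasStatus st = clblasCher2k(clblasColumnMajor, clblasLower, clblasNoTrans,
                                   N, K, alpha,
                                   bufA, 0, lda,
                                   bufB, 0, ldb,
                                   0.0f,                 // real beta
                                   bufC, 0, ldc,
                                   1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}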
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); clblasStatus doHerk( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, const cl_mem A, size_t offA, size_t lda, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; clblasUplo fUplo; clblasTranspose fTransA; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if (numCommandQueues == 0 || commandQueues == NULL) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } // Validate arguments if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) { return retCode; } if (transA == clblasTrans) { return clblasInvalidValue; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) { return retCode; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } fUplo = (order == clblasRowMajor) ? ((uplo == clblasLower) ? clblasUpper : clblasLower) : uplo; fTransA = (order == clblasRowMajor) ? ((transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans) : transA; kargs->order = (order == clblasRowMajor) ? clblasColumnMajor : order; kargs->transA = fTransA; kargs->transB = (fTransA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans; kargs->uplo = fUplo; kargs->M = N; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = A; kargs->offBX = offA; kargs->ldb.matrix = lda; kargs->C = C; kargs->offCY = offC; kargs->ldc.matrix = ldc; kargs->pigFuncID = CLBLAS_HERK; err = CL_SUCCESS; #ifdef DEBUG_HERK printf("doHerk called\n"); #endif numCommandQueues = 1; // Call GEMM to handle HERK. 
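// The kargs set up above alias B to the same buffer, offset and leading
// dimension as A, and pick transB as the conjugate-transpose complement of
// transA, so the generic GEMM path effectively computes
// C := alpha*A*A**H + beta*C (or the A**H*A form for the transposed case),
// while pigFuncID = CLBLAS_HERK keeps the update confined to the uplo
// triangle of C.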
err = executeGEMM(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); /* listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GEMM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); */ return (clblasStatus)err; } clblasStatus clblasCherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; FloatComplex fAlpha, fBeta; memset(&kargs, 0, sizeof(kargs)); CREAL(fAlpha) = alpha; CIMAG(fAlpha) = 0.0f; CREAL(fBeta) = beta; CIMAG(fBeta) = 0.0f; kargs.alpha.argFloatComplex = fAlpha; kargs.beta.argFloatComplex = fBeta; kargs.dtype = TYPE_COMPLEX_FLOAT; return doHerk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex fAlpha, fBeta; memset(&kargs, 0, sizeof(kargs)); CREAL(fAlpha) = alpha; CIMAG(fAlpha) = 0.0f; CREAL(fBeta) = beta; CIMAG(fBeta) = 0.0f; kargs.alpha.argDoubleComplex = fAlpha; kargs.beta.argDoubleComplex = fBeta; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doHerk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xhpmv.c000066400000000000000000000116141264277366700174050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
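// Sketch with illustrative names: clblasCherk takes real alpha/beta and updates
// only the uplo triangle of the N x N matrix C with C := alpha*A*A**H + beta*C.
// Assumes bufA is N x K (clblasNoTrans) and bufC is N x N, both column-major.
static clblasStatus example_cherk(cl_command_queue queue,
                                  cl_mem bufA, cl_mem bufC,
                                  size_t N, size_t K, size_t lda, size_t ldc)
{
    cl_event done = NULL;
    clblasStatus st = clblasCherk(clblasColumnMajor, clblasLower, clblasNoTrans,
                                  N, K, 1.0f,
                                  bufA, 0, lda,
                                  0.0f,
                                  bufC, 0, ldc,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}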
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doHpmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq1, seq2; cl_event first_event; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP, offa, 0, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->A = AP; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = 0; // Set lda as zero for packed matrices kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; kargs->pigFuncID = CLBLAS_HPMV; listInitHead(&seq1); err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq1); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq1); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasConjTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq1); return (clblasStatus)err; } clblasStatus clblasChpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZhpmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double2 alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, 
numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xnrm2.c000066400000000000000000000233701264277366700173130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define USE_HYPOT #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doNrm2_hypot(CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; cl_event firstNrmCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; // // Scratch buffer will be of %PTYPE // Result of compelx nrm2 is scalar // DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype)); kargs->redctnType = REDUCE_BY_HYPOT; memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); redctnArgs.dtype = nrmType; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_NRM2, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstNrmCall, &seq); if (err == CL_SUCCESS) { /** The second kernel call needs to know the number of work-groups used in the first kernel call. This number of work-groups is calculated here and passed as N to second reduction kernel **/ err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstNrmCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus doNrm2_ssq(CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq, seq2; cl_event firstNrmCall; CLBlasKargs redctnArgs; ListNode *listNodePtr; SolutionStep *step; // // Scratch buffer will be of %PTYPE // Result of compelx nrm2 is scalar // DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype)); kargs->redctnType = REDUCE_BY_SSQ; memcpy(&redctnArgs, kargs, sizeof(CLBlasKargs)); redctnArgs.dtype = nrmType; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_NRM2, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &firstNrmCall, &seq); if (err == CL_SUCCESS) { /** The second kernel call needs to know the number of work-groups used in the first kernel call. 
This number of work-groups is calculated here and passed as N to second reduction kernel **/ err = executeSolutionSeq(&seq); if (err == CL_SUCCESS) { listNodePtr = listNodeFirst(&seq); // Get the node step = container_of(listNodePtr, node, SolutionStep); redctnArgs.N = step->pgran.numWGSpawned[0]; // 1D block was used listInitHead(&seq2); err = makeSolutionSeq(CLBLAS_REDUCTION_EPILOGUE, &redctnArgs, numCommandQueues, commandQueues, 1, &firstNrmCall, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus doNrm2( bool useHypot, CLBlasKargs *kargs, size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus retCode = clblasSuccess; DataType nrmType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype)); if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, NRM2, scratchBuff, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { printf("Invalid mem object..\n"); return retCode; } // Check wheather enough memory was allocated retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET ); if (retCode) { printf("Invalid Size for X\n"); return retCode; } // Minimum size of scratchBuff is 2*N retCode = checkVectorSizes(kargs->dtype, (2*N), scratchBuff, 0, 1, X_VEC_ERRSET ); if (retCode) { printf("Insufficient ScratchBuff\n"); return retCode; } retCode = checkVectorSizes(nrmType, 1, NRM2, offNRM2, 1, Y_VEC_ERRSET ); if (retCode) { printf("Invalid Size for NRM2\n"); return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = NRM2; kargs->offA = offNRM2; kargs->offa = offNRM2; kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; if(incx < 1) { // According to netlib, if incx<1, NRM2 will be zero kargs->N = 1; // Makeing it launch only 1 work-group } kargs->D = scratchBuff; if(useHypot) { return doNrm2_hypot(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { return doNrm2_ssq(kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } } clblasStatus clblasSnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { bool useHypot; CLBlasKargs kargs; #ifdef USE_HYPOT useHypot = true; #else useHypot = false; #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { bool useHypot; CLBlasKargs kargs; #ifdef USE_HYPOT useHypot = true; #else useHypot = false; #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasScnrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { bool useHypot; CLBlasKargs kargs; #ifdef USE_HYPOT useHypot = true; #else useHypot = false; #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDznrm2( size_t N, cl_mem NRM2, size_t offNRM2, const cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { bool useHypot; CLBlasKargs kargs; #ifdef USE_HYPOT useHypot = true; #else useHypot = false; #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; return doNrm2(useHypot, &kargs, N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xrot.c000066400000000000000000000127371264277366700172460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
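// Host-side sketch (assumed names, error handling abbreviated): the nrm2
// routines above require a scratch buffer of at least 2*N elements of the
// input type and write a single scalar result at offNRM2; the code also notes
// that, per netlib, the result is zero when incx < 1. Reading the result back
// with clEnqueueReadBuffer is shown for completeness.
static clblasStatus example_snrm2(cl_context ctx, cl_command_queue queue,
                                  cl_mem bufX, size_t N, float *result)
{
    cl_mem bufNrm2    = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
                                       sizeof(cl_float), NULL, NULL);
    cl_mem bufScratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                       2 * N * sizeof(cl_float), NULL, NULL);
    cl_event done = NULL;
    clblasStatus st = clblasSnrm2(N, bufNrm2, 0, bufX, 0, 1, bufScratch,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        // blocking read of the single scalar result
        clEnqueueReadBuffer(queue, bufNrm2, CL_TRUE, 0, sizeof(cl_float),
                            result, 1, &done, NULL);
    }
    clReleaseMemObject(bufScratch);
    clReleaseMemObject(bufNrm2);
    return st;
}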
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doRot( CLBlasKargs *kargs, size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_ROT printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_ROT printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_ROT printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->pigFuncID = CLBLAS_ROT; // Using ROTM kernel for ROT. 
Both are similar listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = C; kargs.beta.argFloat = S; return doRot(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = C; kargs.beta.argDouble = S; return doRot(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCsrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, float C, float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloat = C; kargs.beta.argFloat = S; return doRot(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZdrot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, double C, double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDouble = C; kargs.beta.argDouble = S; return doRot(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xrotg.c000066400000000000000000000140021264277366700174000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
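// Sketch (assumed names): clblasSrot applies the plane rotation
// x_i := c*x_i + s*y_i and y_i := c*y_i - s*x_i over the two vectors. As the
// code above notes, the ROTM kernel is reused for ROT, with c and s carried in
// the alpha/beta kernel arguments.
static clblasStatus example_srot(cl_command_queue queue,
                                 cl_mem bufX, cl_mem bufY, size_t N,
                                 float c, float s)
{
    cl_event done = NULL;
    clblasStatus st = clblasSrot(N, bufX, 0, 1, bufY, 0, 1, c, s,
                                 1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}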
* ************************************************************************/ //#define DEBUG_ROTG #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doRotg( CLBlasKargs *kargs, cl_mem A, size_t offA, cl_mem B, size_t offB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; // C is of real type even for complex numbers DataType cType = (kargs->dtype == TYPE_COMPLEX_FLOAT)? TYPE_FLOAT : ((kargs->dtype == TYPE_COMPLEX_DOUBLE)? TYPE_DOUBLE : (kargs->dtype)); if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(A, B, A, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects A, B #ifdef DEBUG_ROTG printf("Invalid mem object..\n"); #endif return retCode; } retCode = checkMemObjects(C, S, C, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects C, S #ifdef DEBUG_ROTG printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTG printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTG printf("Invalid Size for B\n"); #endif return retCode; } if ((retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTG printf("Invalid Size for C\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTG printf("Invalid Size for S\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->A = A; kargs->B = B; kargs->C = C; kargs->D = S; kargs->offa = offA; kargs->offb = offB; kargs->offc = offC; kargs->offd = offS; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTG, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSrotg( cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doRotg(&kargs, SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDrotg( cl_mem DA, size_t offDA, cl_mem DB, size_t offDB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doRotg(&kargs, DA, offDA, DB, offDB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; return doRotg(&kargs, CA, offCA, CB, offCB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZrotg( cl_mem CA, size_t offCA, cl_mem CB, size_t offCB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; return doRotg(&kargs, CA, offCA, CB, offCB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xrotm.c000066400000000000000000000106231264277366700174130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
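// Sketch (illustrative names): every argument to clblasSrotg is a
// single-element device buffer, per the size checks above; following the BLAS
// rotg convention, SA/SB come back holding r and z while C/S receive the
// cosine and sine of the rotation.
static clblasStatus example_srotg(cl_command_queue queue,
                                  cl_mem bufSA, cl_mem bufSB,
                                  cl_mem bufC, cl_mem bufS)
{
    cl_event done = NULL;
    clblasStatus st = clblasSrotg(bufSA, 0, bufSB, 0, bufC, 0, bufS, 0,
                                  1, &queue, 0, NULL, &done);
    if (st == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
    return st;
}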
* ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doRotm( CLBlasKargs *kargs, size_t N, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem param, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, param, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_ROTM printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_ROTM printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTM printf("Invalid Size for Y\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTM printf("Invalid Size for PARAM\n"); // PARAM is of minimum length 5 #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy kargs->D = param; kargs->offd = offParam; kargs->pigFuncID = CLBLAS_ROTM; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doRotm(&kargs, N, X, offx, incx, Y, offy, incy, SPARAM, offSparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDrotm( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, const cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doRotm(&kargs, N, X, offx, incx, Y, offy, incy, DPARAM, offDparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xrotmg.c000066400000000000000000000120771264277366700175670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doRotmg( CLBlasKargs *kargs, cl_mem D1, size_t offD1, cl_mem D2, size_t offD2, cl_mem X1, size_t offX1, cl_mem Y1, size_t offY1, cl_mem param, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(D1, D2, X1, true, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects A, B #ifdef DEBUG_ROTMG printf("Invalid mem object..\n"); #endif return retCode; } retCode = checkMemObjects(Y1, param, Y1, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { // for mem objects C, S #ifdef DEBUG_ROTMG printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for D1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for D2\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for X1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for Y1\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET))) { #ifdef DEBUG_ROTMG printf("Invalid Size for PARAM\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->A = D1; kargs->B = D2; kargs->C = X1; kargs->D = Y1; kargs->E = param; kargs->offa = offD1; kargs->offb = offD2; kargs->offc = offX1; kargs->offd = offY1; kargs->offe = offParam; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_ROTMG, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSrotmg( cl_mem SD1, size_t offSD1, cl_mem SD2, size_t offSD2, cl_mem SX1, size_t offSX1, const cl_mem SY1, size_t offSY1, cl_mem SPARAM, size_t offSparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doRotmg(&kargs, SD1, offSD1, SD2, offSD2, SX1, offSX1, SY1, offSY1, SPARAM, offSparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDrotmg( cl_mem DD1, size_t offDD1, cl_mem DD2, size_t offDD2, cl_mem DX1, size_t offDX1, const cl_mem DY1, size_t offDY1, cl_mem DPARAM, size_t offDparam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doRotmg(&kargs, DD1, offDD1, DD2, offDD2, DX1, offDX1, DY1, offDY1, DPARAM, offDparam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xscal.c000066400000000000000000000150511264277366700173540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ //#define DEBUG_SCAL #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doScal( CLBlasKargs *kargs, size_t N, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, X, X, false, X_VEC_ERRSET, X_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_SCAL printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_SCAL printf("Invalid Size for X\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx if(incx < 0) { // According to Netlib - return for negative incx return clblasSuccess; } #ifdef DEBUG_SCAL printf("Calling makeSolutionSeq from DoScal: SCAL\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SCAL, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSscal( size_t N, float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SCAL printf("\nSSCAL Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDscal( size_t N, double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SCAL printf("\nDSCAL Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCscal( size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SCAL printf("\nCSCAL Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZscal( size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue 
*commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SCAL printf("\nZSCAL Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCsscal( size_t N, float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; FloatComplex fAlpha; #ifdef DEBUG_SSCAL printf("\nCSSCAL Called\n"); #endif CREAL(fAlpha) = alpha; CIMAG(fAlpha) = 0.0f; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloatComplex = fAlpha; kargs.dtype = TYPE_COMPLEX_FLOAT; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZdscal( size_t N, double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex fAlpha; #ifdef DEBUG_SSCAL printf("\nZDSCAL Called\n"); #endif CREAL(fAlpha) = alpha; CIMAG(fAlpha) = 0.0f; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDoubleComplex = fAlpha; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doScal(&kargs, N, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xscal.cc000066400000000000000000000202051264277366700175140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include // // This file provide the functor based public clBLAS API for // // clblasSscal() // clblasDscal() // clblasCscal() // clblasZscal() // clblasCsscal() // clblasZdscal() // extern "C" clblasStatus clblasSscal( size_t N, float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_FLOAT, N, X, offx, incx); clblasSscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasSscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); functor = fselector->select_sscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasDscal( size_t N, double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_DOUBLE, N, X, offx, incx); clblasDscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasDscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); functor = fselector->select_dscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasCscal( size_t N, cl_float2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_COMPLEX_FLOAT, N, X, offx, incx); clblasCscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasCscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); functor = fselector->select_cscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasZscal( size_t N, cl_double2 alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_COMPLEX_DOUBLE, N, X, offx, incx); clblasZscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasZscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = 
clblasFunctorSelector::find(queue); functor = fselector->select_zscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasCsscal( size_t N, float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_COMPLEX_FLOAT, N, X, offx, incx); clblasCsscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasCsscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); functor = fselector->select_csscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasZdscal( size_t N, double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); CHECK_VECTOR_X(TYPE_COMPLEX_DOUBLE, N, X, offx, incx); clblasZdscalFunctor * functor ; if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasZdscalFunctor::Args args(N, alpha, X, offx, incx, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); functor = fselector->select_zdscal_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } clblas-2.10/src/library/blas/xshbmv.c000066400000000000000000000147241264277366700175570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doSHbmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) { return retCode; } /* numCommandQueues will be hardcoded to 1 as of now. No multi-gpu support */ numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->transA = clblasNoTrans; kargs->N = N; kargs->M = N; kargs->KL = K; kargs->KU = K; kargs->A = A; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_SBMV; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doSHbmv(&kargs, order, uplo, N, K, A, offa, lda, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_SBMV; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doSHbmv(&kargs, order, uplo, N, K, A, offa, lda, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasChbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const 
cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_HBMV; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doSHbmv(&kargs, order, uplo, N, K, A, offa, lda, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZhbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_HBMV; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doSHbmv(&kargs, order, uplo, N, K, A, offa, lda, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xspmv.c000066400000000000000000000115321264277366700174170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doSpmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq1, seq2; cl_event first_event; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP, offa, 0, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { return retCode; } if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->A = AP; kargs->offA = offa; kargs->offa = offa; kargs->lda.matrix = 0; // Set lda as zero for packed matrices kargs->B = X; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = Y; kargs->offCY = offy; kargs->ldc.vector = incy; kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; kargs->pigFuncID = CLBLAS_SPMV; listInitHead(&seq1); err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq1); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq1); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq2); } freeSolutionSeq(&seq2); } } freeSolutionSeq(&seq1); return (clblasStatus)err; } clblasStatus clblasSspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doSpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDspmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doSpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } 
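
/* ------------------------------------------------------------------------
 * Illustrative host-side usage sketch for the packed SPMV routine defined
 * above; this block is not part of the archived xspmv.c. doSpmv() builds the
 * symmetric multiply out of two piggy-backed TRMV passes, but from the host
 * the entry point is simply clblasSspmv()/clblasDspmv(). Assumptions (all
 * hypothetical, not taken from the sources): clblasSetup() has already
 * succeeded, queue is a valid in-order command queue, and bufAP, bufX, bufY
 * are cl_mem buffers holding N*(N+1)/2, N and N floats respectively
 * (incx = incy = 1). Computes y = alpha*A*x + beta*y for packed symmetric A.
 * ------------------------------------------------------------------------ */
#include <clBLAS.h>

static cl_int
example_sspmv(cl_command_queue queue, cl_mem bufAP, cl_mem bufX, cl_mem bufY, size_t N)
{
    cl_event done = NULL;
    clblasStatus status;

    status = clblasSspmv(clblasColumnMajor, clblasLower, N,
                         1.0f,                 /* alpha */
                         bufAP, 0,             /* packed A and its offset */
                         bufX, 0, 1,           /* x, offx, incx */
                         0.0f,                 /* beta: overwrite y */
                         bufY, 0, 1,           /* y, offy, incy */
                         1, &queue,            /* single command queue */
                         0, NULL, &done);
    if (status != clblasSuccess) {
        return (cl_int)status;
    }
    return clWaitForEvents(1, &done);          /* block until the result is ready */
}
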
clblas-2.10/src/library/blas/xswap.c000066400000000000000000000124611264277366700174060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_SWAP #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doSwap( CLBlasKargs *kargs, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(X, Y, X, false, X_VEC_ERRSET, Y_VEC_ERRSET, X_VEC_ERRSET ); if (retCode) { #ifdef DEBUG_SWAP printf("Invalid mem object..\n"); #endif return retCode; } // Check wheather enough memory was allocated if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_SWAP printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_SWAP printf("Invalid Size for Y\n"); #endif return retCode; } /////////////////////////////////////////////////////////////// if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } /* numCommandQueues will be hardcoded to 1 as of now. 
No multi-gpu support */ numCommandQueues = 1; if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } kargs->N = N; kargs->A = X; kargs->offBX = offx; kargs->ldb.vector = incx; // Will be using this as incx kargs->B = Y; kargs->offCY = offy; kargs->ldc.vector = incy; // Will be using this as incy #ifdef DEBUG_SWAP printf("Calling makeSolutionSeq from DoSwap: SWAP\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SWAP, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SWAP printf("\nSSWAP Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; return doSwap(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SWAP printf("\nDSWAP Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; return doSwap(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SWAP printf("\nCSWAP Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; return doSwap(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZswap( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SWAP printf("\nZSWAP Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; return doSwap(&kargs, N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xsymm.c000066400000000000000000000266631264277366700174320ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #define SYMM_USING_GEMM //#define DEBUG_SYMM extern clblasStatus executeGEMM( CLBlasKargs *kargs, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); clblasStatus doSymm( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side, size_t M, size_t N, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, BlasFunctionID symm_or_hemm) { cl_int err; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) { return retCode; } if (side == clblasLeft) { // MxM x MxN if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET))) { return retCode; } } else { // MxN x NxN if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) { return retCode; } } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET))) { return retCode; } #ifdef DEBUG_SYMM printf("DoSymm being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } numCommandQueues = 1; kargs->order = order; kargs->uplo = uplo; kargs->side = side; kargs->pigFuncID = symm_or_hemm; kargs->M = M; if (kargs->side == clblasLeft) { kargs->K = M; } else { kargs->K = N; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->B = B; kargs->ldb.matrix = ldb; kargs->C = C; kargs->ldc.matrix = ldc; kargs->offA = offa; kargs->offa = offa; kargs->offA = offa; kargs->offBX = offb; kargs->offCY = offc; kargs->offsetM = 0; kargs->offsetN = 0; //kargs->offsetK = 0; FIXME: not found offsetK in new AMD structure! 
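    /*
     * Note on the code path below: for row-major callers the problem is first
     * recast as an equivalent column-major one (M/N, side and uplo are
     * swapped). With SYMM_USING_GEMM defined, the symmetric/hermitian product
     * is then assembled from three chained GEMM passes: a no-transpose pass
     * that applies the caller's beta to C, a second (conjugate-)transposed
     * pass whose beta is forced to 1 so it accumulates into C, and a final
     * diagonal correction pass (CLBLAS_SYMM_DIAGONAL / CLBLAS_HEMM_DIAGONAL),
     * each pass waiting on the previous pass's event.
     */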
kargs->scimage[0] = 0; kargs->scimage[1] = 0; if (kargs->order == clblasRowMajor) { kargs->order = clblasColumnMajor; kargs->M = N; kargs->N = M; if (kargs->side == clblasLeft) { kargs->side = clblasRight; } else { kargs->side = clblasLeft; } if (kargs->uplo == clblasUpper) { kargs->uplo = clblasLower; } else { kargs->uplo = clblasUpper; } } #ifndef SYMM_USING_GEMM #ifdef DEBUG_SYMM printf("Calling makeSolutionSeq : SYMM \n"); #endif { ListHead seq; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYMM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); } #else // // SYMM_USING_GEMM // { CLBlasKargs GEMMNArgs, GEMMTArgs, GEMMDArgs; cl_event gemmNEvent, gemmTEvent ; FloatComplex cBeta; DoubleComplex zBeta; clblasTranspose transposeFunction = clblasTrans; memcpy(&GEMMNArgs, kargs, sizeof(CLBlasKargs)); memcpy(&GEMMTArgs, kargs, sizeof(CLBlasKargs)); memcpy(&GEMMDArgs, kargs, sizeof(CLBlasKargs)); switch(symm_or_hemm) { case CLBLAS_SYMM: transposeFunction = clblasTrans; GEMMDArgs.pigFuncID = CLBLAS_SYMM_DIAGONAL; break; case CLBLAS_HEMM: transposeFunction = clblasConjTrans; GEMMDArgs.pigFuncID = CLBLAS_HEMM_DIAGONAL; break; default: printf("WARNING: doSymm(): Neither SYMM nor HEMM is calling this function."); break; } // // It is the diagonal piggy back for GEMMD. For others, it is just CLBLAS_SYMM // // // Set the Transpose for GEMM'T' and GEMM'D' // The other two do not have transpose by default // switch(kargs->side) { case clblasLeft: GEMMTArgs.transA = transposeFunction; if (kargs->uplo == clblasUpper) { // // This is for proper TAIL handling for Right Lower case alone // For all other cases, NN kernel is good enough to handle tails // GEMMDArgs.transA = transposeFunction; } break; case clblasRight: GEMMTArgs.transB = transposeFunction; if (kargs->uplo == clblasLower) { // // This is for proper TAIL handling for Right Lower case alone // For all other cases, NN kernel is good enough to handle tails // GEMMDArgs.transB = transposeFunction; } break; default: break; } // // Set the BETA multiplier to 1 for GEMMT and GEMMD // memset(&GEMMTArgs.beta, 0, sizeof(GEMMTArgs.beta)); memset(&GEMMDArgs.beta, 0, sizeof(GEMMDArgs.beta)); switch(kargs->dtype) { case TYPE_FLOAT: GEMMTArgs.beta.argFloat = 1.0f; GEMMDArgs.beta.argFloat = 1.0f; break; case TYPE_DOUBLE: GEMMTArgs.beta.argDouble = 1.0; GEMMDArgs.beta.argDouble = 1.0; break; case TYPE_COMPLEX_FLOAT: CREAL(cBeta) = 1.0f; CIMAG(cBeta) = 0.0f; GEMMTArgs.beta.argFloatComplex = cBeta; GEMMDArgs.beta.argFloatComplex = cBeta; break; case TYPE_COMPLEX_DOUBLE: CREAL(zBeta) = 1.0; CIMAG(zBeta) = 0.0; GEMMTArgs.beta.argDoubleComplex = zBeta; GEMMDArgs.beta.argDoubleComplex = zBeta; break; } // // GEMM Handler will notice the "pigFuncID" and set appropriate flags // err = executeGEMM(&GEMMNArgs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &gemmNEvent); if (err == CL_SUCCESS) { err = executeGEMM(&GEMMTArgs, numCommandQueues, commandQueues, 1, &gemmNEvent, &gemmTEvent); if (err == CL_SUCCESS) { err = executeGEMM(&GEMMDArgs, numCommandQueues, commandQueues, 1, &gemmTEvent, events); } } } #endif return (clblasStatus)err; } clblasStatus clblasSsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue 
*commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; #ifdef DEBUG_SYMM printf("Ssymm called\n"); #endif return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_SYMM); } clblasStatus clblasDsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SYMM printf("Dsymm called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_SYMM); } clblasStatus clblasCsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SYMM printf("Csymm called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_SYMM); } clblasStatus clblasZsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double2 beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_SYMM printf("Zsymm called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; return doSymm( &kargs, order, uplo, side, M, N, A, offa, lda, B, offb, ldb, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, CLBLAS_SYMM); } clblas-2.10/src/library/blas/xsymv.c000066400000000000000000000123671264277366700174370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define USE_SYMV #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doSymv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; #ifdef USE_SYMV ListHead seq2; ListNode *listNodePtr; cl_event first_event; #endif if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ))) { return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET ))) { return retCode; } kargs->order = order; kargs->uplo = uplo; kargs->N = N; kargs->K = N; //store original N kargs->A = A; kargs->offA = offA; kargs->offa = offA; kargs->lda.matrix = lda; kargs->B = x; kargs->offBX = offx; kargs->ldb.vector = incx; kargs->C = y; kargs->offCY = offy; kargs->ldc.vector = incy; #ifndef USE_SYMV listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } #else // version of SYMV using kprintf numCommandQueues = 1; listInitHead(&seq); kargs->transA = clblasNoTrans; kargs->diag = clblasNonUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, &first_event, &seq); if (err == CL_SUCCESS) { listInitHead(&seq2); kargs->transA = clblasTrans; kargs->diag = clblasUnit; err = makeSolutionSeq(CLBLAS_HEMV, kargs, numCommandQueues, commandQueues, 1, &first_event, events, &seq2); if (err == CL_SUCCESS) { // Adding node from seq2 to main seq listNodePtr = listNodeFirst(&seq2); listAddToTail(&seq, listNodePtr); err = executeSolutionSeq(&seq); // Executes both kernels in the seq one after other } } #endif freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_float beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; return doSymv(&kargs, order, uplo, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsymv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem x, size_t offx, int incx, cl_double beta, cl_mem y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, 
cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; return doSymv(&kargs, order, uplo, N, A, offA, lda, x, offx, incx, y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xsyr.c000066400000000000000000000142441264277366700172520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doSyr( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } #ifdef DEBUG_SYR printf("doSyr called\n"); #endif /* Validate arguments */ if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) { #ifdef DEBUG_SYR printf("Invalid mem object..\n"); #endif return retCode; } /* * PENDING: * checkMatrixSizes() does not account of "offa" argument. * Need to be added. */ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_SYR printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_SYR printf("Invalid Size for X\n"); #endif return retCode; } if ((commandQueue == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } if(order == clblasRowMajor) { kargs->order = clblasColumnMajor; kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper; } else { kargs->order = order; kargs->uplo = uplo; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_SYR printf("Calling makeSolutionSeq : SYR\n"); #endif /* * Always use CommandQueue (0) * PENDING: * 1. No Multi-GPU / Multi-command queue support * 2. 
This can be optimized to use the commandQ with the higher * memmory bandwidth that supports the data-type and the LDA */ numCommandQueues = 1; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYR, kargs, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_SYR; #ifdef DEBUG_SYR printf("Ssyr called\n"); #endif return doSyr(&kargs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsyr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_SYR; #ifdef DEBUG_SYR printf("Dsyr called\n"); #endif return doSyr(&kargs, order, uplo, N, X, offx, incx, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasSspr( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_SPR; return doSyr(&kargs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDspr( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_SPR; return doSyr(&kargs, order, uplo, N, X, offx, incx, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xsyr2.c000066400000000000000000000154251264277366700173360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doSyr2( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, size_t N, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } #ifdef DEBUG_SYR2 printf("doSyr2 called\n"); #endif /* Validate arguments */ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_SYR2 printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET ))) { #ifdef DEBUG_SYR2 printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_SYR2 printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) { #ifdef DEBUG_SYR2 printf("Invalid Size for Y\n"); #endif return retCode; } if ((commandQueue == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } if(order == clblasRowMajor) { kargs->order = clblasColumnMajor; kargs->uplo = (uplo == clblasUpper) ? clblasLower : clblasUpper; } else { kargs->order = order; kargs->uplo = uplo; } kargs->N = N; kargs->A = A; kargs->lda.matrix = lda; kargs->B = X; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->C = Y; kargs->ldc.vector = incy; kargs->offCY = offy; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_SYR2 printf("Calling makeSolutionSeq : SYR2\n"); #endif /* * Always use CommandQueue (0) * PENDING: * 1. No Multi-GPU / Multi-command queue support * 2. 
This can be optimized to use the commandQ with the higher * memmory bandwidth that supports the data-type and the LDA */ numCommandQueues = 1; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYR2, kargs, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsyr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_SYR2; #ifdef DEBUG_SYR2 printf("Ssyr2 called\n"); #endif return doSyr2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsyr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_SYR2; #ifdef DEBUG_SYR2 printf("Dsyr2 called\n"); #endif return doSyr2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasSspr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; kargs.pigFuncID = CLBLAS_SPR2; return doSyr2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDspr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue* commandQueue, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; kargs.pigFuncID = CLBLAS_SPR2; return doSyr2(&kargs, order, uplo, N, X, offx, incx, Y, offy, incy, AP, offa, 0, numCommandQueues, commandQueue, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xsyr2k.c000066400000000000000000000144141264277366700175060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doSyr2k( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if (numCommandQueues == 0 || commandQueues == NULL) { return clblasInvalidValue; } // Validate arguments if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) { return retCode; } if (isComplexType(kargs->dtype) && transAB == clblasConjTrans) { return clblasInvalidValue; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) { return retCode; } kargs->order = order; kargs->transA = transAB; kargs->transB = transAB; kargs->uplo = uplo; kargs->M = N; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; kargs->C = C; kargs->offCY = offC; kargs->ldc.matrix = ldc; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYR2K, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; kargs.dtype = TYPE_FLOAT; return doSyr2k(&kargs, order, uplo, transAB, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; kargs.dtype = TYPE_DOUBLE; return 
doSyr2k(&kargs, order, uplo, transAB, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; kargs.dtype = TYPE_COMPLEX_FLOAT; return doSyr2k(&kargs, order, uplo, transAB, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doSyr2k(&kargs, order, uplo, transAB, N, K, A, offA, lda, B, offB, ldb, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xsyrk.c000066400000000000000000000134611264277366700174250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doSyrk( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, const cl_mem A, size_t offA, size_t lda, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } if (numCommandQueues == 0 || commandQueues == NULL) { return clblasInvalidValue; } // Validate arguments if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) { return retCode; } if (isComplexType(kargs->dtype) && transA == clblasConjTrans) { return clblasInvalidValue; } if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) { return retCode; } kargs->order = order; kargs->transA = transA; kargs->transB = transA; kargs->uplo = uplo; kargs->M = N; kargs->N = N; kargs->K = K; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = A; kargs->offBX = offA; kargs->ldb.matrix = lda; kargs->C = C; kargs->offCY = offC; kargs->ldc.matrix = ldc; listInitHead(&seq); err = makeSolutionSeq(CLBLAS_SYRK, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasSsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloat = alpha; kargs.beta.argFloat = beta; kargs.dtype = TYPE_FLOAT; return doSyrk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDouble = alpha; kargs.beta.argDouble = beta; kargs.dtype = TYPE_DOUBLE; return doSyrk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; kargs.dtype = TYPE_COMPLEX_FLOAT; return 
doSyrk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; kargs.dtype = TYPE_COMPLEX_DOUBLE; return doSyrk(&kargs, order, uplo, transA, N, K, A, offA, lda, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xtbmv.c000066400000000000000000000171161264277366700174060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#define DEBUG_TBMV #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doTbmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem x, size_t offx, int incx, cl_mem y, // Scratch Buffer cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t sizeOfVector; cl_event *newEventWaitList; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_TBMV printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_TBMV printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_TBMV printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) { #ifdef DEBUG_TBMV printf("Invalid Size for scratch vector\n"); #endif return retCode; } #ifdef DEBUG_TBMV printf("DoTbmv being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } numCommandQueues = 1; if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } newEventWaitList = malloc((numEventsInWaitList+1) * sizeof(cl_event)); if (newEventWaitList == NULL) { return clblasOutOfHostMemory; } if (numEventsInWaitList != 0 ) { memcpy(newEventWaitList, eventWaitList, 
numEventsInWaitList*sizeof(cl_event)); } /* * ASSUMPTION: * doTBMV assumes "commandQueue" of 0. The same is reflected in * "makeSolutionSeq" as well. If either of them changes in future, * this code needs to be revisited. */ sizeOfVector = (1 + (N-1)*abs(incx)) * dtypeSize(kargs->dtype); err = clEnqueueCopyBuffer(commandQueues[0], x, y, offx*dtypeSize(kargs->dtype), 0, sizeOfVector, numEventsInWaitList, eventWaitList, &newEventWaitList[numEventsInWaitList]); if (err != CL_SUCCESS) { free(newEventWaitList); return err; } kargs->order = order; kargs->uplo = uplo; kargs->transA = trans; kargs->diag = diag; kargs->M = N; kargs->N = N; if( uplo == clblasUpper ) { kargs->KL = 0; kargs->KU = K; } else { kargs->KL = K; kargs->KU = 0; } kargs->A = A; kargs->lda.matrix = lda; kargs->B = y; // Now it becomes x = A * y kargs->ldb.vector = incx; kargs->C = x; kargs->ldc.vector = incx; kargs->offBX = 0; // Not used by assignKargs(); Just for clarity kargs->offCY = offx; kargs->offa = offa; kargs->offA = offa; #ifdef DEBUG_TBMV printf("Calling makeSolutionSeq : TBMV\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_GBMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList+1, newEventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); free(newEventWaitList); return (clblasStatus)err; } clblasStatus clblasStbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBMV printf("STBMV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TBMV; return doTbmv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBMV printf("DTBMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TBMV; return doTbmv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBMV printf("CTBMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TBMV; return doTbmv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, 
size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBMV printf("ZTBMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TBMV; return doTbmv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xtbsv.c000066400000000000000000000641461264277366700174210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" //#define DEBUG_TBSV static clblasUplo getUpLo(CLBlasKargs *kargs) { if (kargs->order == clblasRowMajor) { return kargs->uplo; } if (kargs->uplo == clblasUpper) { return clblasLower; } return clblasUpper; } static clblasStatus orchestrateNonTransposeTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; SolutionStep *trtri, *gbmv; size_t nLoops, i; cl_event *triangleEventArray, *rectangleEventArray; size_t TARGET_ROWS; bool gbmvExecute; size_t temp; ListNode *f = listNodeFirst(trtriSeq); trtri = container_of(f, node, SolutionStep); f = listNodeFirst(gbmvSeq); gbmv = container_of(f, node, SolutionStep); TARGET_ROWS = trtri->subdims->y; TARGET_ROWS = (TARGET_ROWS > kargs->K) ? kargs->K : TARGET_ROWS; TARGET_ROWS = (TARGET_ROWS == 0) ? 
1 : TARGET_ROWS; trtri->numEventsInWaitList = numEventsInWaitList; trtri->eventWaitList = eventWaitList; if (kargs->N <= TARGET_ROWS) { trtri->event = events; trtri->args.startRow = 0; trtri->args.endRow = (cl_int)((kargs->N)-1); err = executeSolutionSeq(trtriSeq); return err; } // // Allocate Event Chain // nLoops = ((kargs->N) / TARGET_ROWS); if ((kargs->N % TARGET_ROWS)) { nLoops++; } // // Allocate Event Arrays to order the orchestration // triangleEventArray = malloc(nLoops*sizeof(cl_event)); rectangleEventArray = malloc(nLoops*sizeof(cl_event)); if ((triangleEventArray == NULL) || (rectangleEventArray == NULL)) { if (triangleEventArray) { free (triangleEventArray); } if (rectangleEventArray) { free (rectangleEventArray); } return clblasOutOfHostMemory; } // // Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels // trtri->event = &triangleEventArray[0]; if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS); trtri->args.endRow = (cl_int)((kargs->N)-1); } else { trtri->args.startRow = 0; trtri->args.endRow = (cl_int)(TARGET_ROWS-1); } err = executeSolutionSeq(trtriSeq); /*#define GET_OFFA(offa, lda, r, c, k)\ if(r < k) \ offa = r * lda + col + k - r;\ else if (r == k) \ offa = r * lda + col;\ else\ offa = r * lda + col - (r - k); */ #define GET_OFFA_LOWER(offa, lda, row, col, kl) (offa) = ((row) * (lda)) + (col) + (kl) - (row); #define GET_OFFA_UPPER(offa, lda, row, col) (offa) = ((row) * (lda)) + (col) - (row); if (err == CL_SUCCESS) { // // Solve the Rectangles one by one // //nLoops = 1; for(i=1; inumEventsInWaitList = 1; gbmv->eventWaitList = &triangleEventArray[i-1]; gbmv->event = &rectangleEventArray[i-1]; if (getUpLo(kargs) == clblasUpper) { gbmv->args.N = TARGET_ROWS; gbmv->args.M = ((trtri->args.startRow) >= (int)(kargs->K)) ? kargs->K : (size_t)trtri->args.startRow; gbmv->args.startRow = (trtri->args.startRow - gbmv->args.M); gbmv->args.endRow = (trtri->args.startRow - 1); gbmv->args.KU = (trtri->args.startRow >= (int)(kargs->K)) ? 0 : (kargs->K - trtri->args.startRow); gbmv->args.KL = gbmv->args.M - 1; GET_OFFA_UPPER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, trtri->args.startRow); gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; if(kargs->ldb.vector < 0) { gbmv->args.offBX = kargs->offBX + ((i-1) * TARGET_ROWS) * abs(kargs->ldb.vector); gbmv->args.offCY = kargs->offBX + ((i * TARGET_ROWS) ) * abs(kargs->ldb.vector); } else { gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector; gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; } } else { gbmv->args.startRow = (cl_int)((i)*TARGET_ROWS); gbmv->args.endRow = (cl_int)((((TARGET_ROWS*i) + kargs->K) > kargs->N) ? kargs->N : (TARGET_ROWS*i + kargs->K)); gbmv->args.N = TARGET_ROWS; gbmv->args.M = (gbmv->args.endRow - gbmv->args.startRow); gbmv->args.KU = TARGET_ROWS - 1; gbmv->args.KL = ((trtri->args.startRow + kargs->K) < kargs->N) ? 
(kargs->K - TARGET_ROWS) : (kargs->N - trtri->args.startRow - 1 - TARGET_ROWS); GET_OFFA_LOWER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, trtri->args.startRow, kargs->K); gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; if(kargs->ldb.vector < 0) { gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.startRow) * abs(kargs->ldb.vector); gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow + gbmv->args.M) ) * abs(kargs->ldb.vector); } else { gbmv->args.offBX = kargs->offBX + (trtri->args.startRow) * kargs->ldb.vector; gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; } } #ifdef DEBUG_TBSV printf("GBMV ITER %d, startRow %d, endRow %d, N %d, M %d , KU %d, KL %d, offBX %d, offA %d, offCY %d\n", i-1, gbmv->args.startRow, gbmv->args.endRow, \ gbmv->args.N, gbmv->args.M, gbmv->args.KU, gbmv->args.KL, gbmv->args.offBX, gbmv->args.offA, gbmv->args.offCY); #endif // This is required when KL or KU is 0 for TBSV. gbmvExecute = (gbmv->args.M != 0); if(gbmvExecute) { if(kargs->order == clblasColumnMajor) //GBMV Swaps it back while assigning { temp = gbmv->args.N; gbmv->args.N = gbmv->args.M; gbmv->args.M = temp; temp = gbmv->args.KU; gbmv->args.KU = gbmv->args.KL; gbmv->args.KL = temp; } err = executeSolutionSeq(gbmvSeq); } if (err != CL_SUCCESS) { printf("TBSV: WARNING: GBMV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } #ifdef DEBUG_TBSV printf("Calling TBSV\n"); #endif if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = (cl_int)(((int)trtri->args.startRow - (int)TARGET_ROWS) >= 0) ? (trtri->args.startRow - TARGET_ROWS) : 0; trtri->args.endRow = (cl_int)(gbmv->args.endRow); } else { trtri->args.startRow = gbmv->args.startRow; trtri->args.endRow = (cl_int)(((gbmv->args.startRow + TARGET_ROWS-1) < kargs->N) ? (gbmv->args.startRow + TARGET_ROWS-1) : kargs->N-1); } #ifdef DEBUG_TBSV printf("TRSV ITER %d, startRow %d , endRow %d\n", i, trtri->args.startRow, trtri->args.endRow); #endif trtri->event = &triangleEventArray[i]; if (i == (nLoops-1)) { // // TRTRI's last iteration must be tied to the "event" that the API // user will choose to wait on. // trtri->event = events; } // // For first iteration, TRTRI waits on what the API user has specified. // Subsequent iterations will wait on the previous iteration's rectangle // counterpart // trtri->numEventsInWaitList =1; if(gbmvExecute) { trtri->eventWaitList = &rectangleEventArray[i-1]; } else //GBMV is not executed when KL or KU of the band in TBSV is 0. { trtri->eventWaitList = &triangleEventArray[i-1]; } err = executeSolutionSeq(trtriSeq); if (err != CL_SUCCESS) { printf("TBSV: WARNING: TRSV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } } } free(triangleEventArray); free(rectangleEventArray); return err; } static clblasStatus orchestrateTransposeTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; SolutionStep *trtri, *gbmv; size_t nLoops, i; cl_event *triangleEventArray, *rectangleEventArray; size_t TARGET_ROWS; bool gbmvExecute; size_t temp; int TR_ER, N_SR, SD_ER; ListNode *f = listNodeFirst(trtriSeq); trtri = container_of(f, node, SolutionStep); f = listNodeFirst(gbmvSeq); gbmv = container_of(f, node, SolutionStep); TARGET_ROWS = trtri->subdims->y; TARGET_ROWS = (TARGET_ROWS > kargs->K) ? kargs->K : TARGET_ROWS; TARGET_ROWS = (TARGET_ROWS == 0) ? 
1 : TARGET_ROWS; trtri->numEventsInWaitList = numEventsInWaitList; trtri->eventWaitList = eventWaitList; if (kargs->N <= TARGET_ROWS) { trtri->event = events; trtri->args.startRow = 0; trtri->args.endRow = (cl_int)((kargs->N)); err = executeSolutionSeq(trtriSeq); return err; } // // Allocate Event Chain // nLoops = ((kargs->N) / TARGET_ROWS); if ((kargs->N % TARGET_ROWS)) { nLoops++; } // // Allocate Event Arrays to order the orchestration // triangleEventArray = malloc(nLoops*sizeof(cl_event)); rectangleEventArray = malloc(nLoops*sizeof(cl_event)); if ((triangleEventArray == NULL) || (rectangleEventArray == NULL)) { if (triangleEventArray) { free (triangleEventArray); } if (rectangleEventArray) { free (rectangleEventArray); } return clblasOutOfHostMemory; } // // Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels // trtri->event = &triangleEventArray[0]; if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = 0; trtri->args.endRow = (cl_int)(TARGET_ROWS); } else { trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS); trtri->args.endRow = (cl_int)((kargs->N)); } err = executeSolutionSeq(trtriSeq); /*#define GET_OFFA(offa, lda, r, c, k)\ if(r < k) \ offa = r * lda + col + k - r;\ else if (r == k) \ offa = r * lda + col;\ else\ offa = r * lda + col - (r - k); */ #define GET_OFFA_LOWER(offa, lda, row, col, kl) (offa) = ((row) * (lda)) + (col) + (kl) - (row); #define GET_OFFA_UPPER(offa, lda, row, col) (offa) = ((row) * (lda)) + (col) - (row); if (err == CL_SUCCESS) { // // Solve the Rectangles one by one // //nLoops = 1; #define max(a, b) (((a) > (b)) ? (a) : (b)) #define min(a, b) (((a) < (b)) ? (a) : (b)) for(i=1; inumEventsInWaitList = 1; gbmv->eventWaitList = &triangleEventArray[i-1]; gbmv->event = &rectangleEventArray[i-1]; if (getUpLo(kargs) == clblasUpper) { TR_ER = trtri->args.endRow - 1; gbmv->args.N = max(0, min(((int)kargs->K), ((int)kargs->N - 1 - TR_ER))); gbmv->args.M = TARGET_ROWS; gbmv->args.startRow = (trtri->args.startRow); gbmv->args.endRow = trtri->args.endRow; N_SR = max(0, min(((int)kargs->K), ((int)kargs->N - 1 - (int)trtri->args.startRow))); gbmv->args.KU = N_SR - TARGET_ROWS; gbmv->args.KL = gbmv->args.M - 1; GET_OFFA_UPPER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, (gbmv->args.endRow)); gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; if(kargs->ldb.vector < 0) { gbmv->args.offBX = kargs->offBX + (kargs->N - (gbmv->args.endRow)) * abs(kargs->ldb.vector); gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.endRow + gbmv->args.N) ) * abs(kargs->ldb.vector); } else { gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; gbmv->args.offCY = kargs->offBX + (gbmv->args.endRow) * kargs->ldb.vector; } } else { #define SUBDIAGS(r, k) ((r) <= (k)) ? 
(r) : (k); gbmv->args.startRow = trtri->args.startRow; gbmv->args.endRow = trtri->args.endRow; gbmv->args.N = SUBDIAGS((int)trtri->args.startRow, (int)kargs->K); gbmv->args.M = TARGET_ROWS; gbmv->args.KU = gbmv->args.N - 1; SD_ER = SUBDIAGS((int)(trtri->args.endRow - 1), (int)kargs->K); gbmv->args.KL = SD_ER - gbmv->args.N; GET_OFFA_LOWER(gbmv->args.offA, kargs->lda.matrix, gbmv->args.startRow, (gbmv->args.startRow - gbmv->args.N), kargs->K); gbmv->args.offA -= gbmv->args.KL; gbmv->args.offA += kargs->offA; gbmv->args.offa = gbmv->args.offA; if(kargs->ldb.vector < 0) { gbmv->args.offBX = kargs->offBX + (kargs->N - gbmv->args.endRow) * abs(kargs->ldb.vector); gbmv->args.offCY = kargs->offBX + (kargs->N - (gbmv->args.startRow) ) * abs(kargs->ldb.vector); } else { gbmv->args.offBX = kargs->offBX + (gbmv->args.startRow) * kargs->ldb.vector; gbmv->args.offCY = kargs->offBX + (gbmv->args.startRow - gbmv->args.N) * kargs->ldb.vector; } } #ifdef DEBUG_TBSV printf("GBMV ITER %d, startRow %d, endRow %d, N %d, M %d , KU %d, KL %d, offBX %d, offA %d, offCY %d\n", i-1, gbmv->args.startRow, gbmv->args.endRow, \ gbmv->args.N, gbmv->args.M, gbmv->args.KU, gbmv->args.KL, gbmv->args.offBX, gbmv->args.offA, gbmv->args.offCY); #endif // This is required when KL or KU is 0 for TBSV. gbmvExecute = (gbmv->args.N != 0); if(gbmvExecute) { if(kargs->order == clblasColumnMajor) //GBMV Swaps it back while assigning { temp = gbmv->args.N; gbmv->args.N = gbmv->args.M; gbmv->args.M = temp; temp = gbmv->args.KU; gbmv->args.KU = gbmv->args.KL; gbmv->args.KL = temp; } err = executeSolutionSeq(gbmvSeq); } if (err != CL_SUCCESS) { printf("TBSV: WARNING: GBMV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } #ifdef DEBUG_TBSV printf("Calling TBSV\n"); #endif if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = (cl_int)(trtri->args.endRow); trtri->args.endRow = (cl_int)(((int)trtri->args.endRow + (int)TARGET_ROWS) <= (int)kargs->N) ? (trtri->args.endRow + TARGET_ROWS) : kargs->N; } else { trtri->args.endRow = trtri->args.startRow; trtri->args.startRow = (cl_int)((((int)trtri->args.startRow - (int)TARGET_ROWS) > 0) ? (trtri->args.startRow - TARGET_ROWS) : 0); } #ifdef DEBUG_TBSV printf("TRSV ITER %d, startRow %d , endRow %d\n", i, trtri->args.startRow, trtri->args.endRow); #endif trtri->event = &triangleEventArray[i]; if (i == (nLoops-1)) { // // TRTRI's last iteration must be tied to the "event" that the API // user will choose to wait on. // trtri->event = events; } // // For first iteration, TRTRI waits on what the API user has specified. // Subsequent iterations will wait on the previous iteration's rectangle // counterpart // trtri->numEventsInWaitList =1; if(gbmvExecute) { trtri->eventWaitList = &rectangleEventArray[i-1]; } else //GBMV is not executed when KL or KU of the band in TBSV is 0. 
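        // so this iteration's triangle solve waits directly on the previous
        // triangle's event instead of a rectangle event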
{ trtri->eventWaitList = &triangleEventArray[i-1]; } err = executeSolutionSeq(trtriSeq); if (err != CL_SUCCESS) { printf("TBSV: WARNING: TRSV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } } } free(triangleEventArray); free(rectangleEventArray); return err; } static clblasStatus orchestrateTBSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gbmvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err = clblasNotImplemented; if ( ((kargs->order == clblasRowMajor) && (kargs->transA == clblasNoTrans)) || ((kargs->order == clblasColumnMajor) && (kargs->transA != clblasNoTrans)) ) { #ifdef DEBUG_TBSV printf("Orchestrating the NO-Transpose case..\n"); #endif err = orchestrateNonTransposeTBSV(kargs, trtriSeq, gbmvSeq, numEventsInWaitList, eventWaitList, events); } else { #ifdef DEBUG_TRSV printf("Orchestrating the Transpose case..\n"); #endif err = orchestrateTransposeTBSV(kargs, trtriSeq, gbmvSeq, numEventsInWaitList, eventWaitList, events); } return err; } clblasStatus doTbsv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem x, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err = clblasNotImplemented; ListHead seq; CLBlasKargs gbmvKargs; ListHead gbmvSeq; //cl_context c; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid mem object..\n"); #endif return retCode; } /* * PENDING: * checkMatrixSizes() does not account for "offa" argument. * Need to pass "offa" when "checkMatrixSizes()" is changed. */ retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET ); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid Size for A\n"); #endif return retCode; } retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ); if (retCode != clblasSuccess) { #ifdef DEBUG_TBSV printf("Invalid Size for X\n"); #endif return retCode; } #ifdef DEBUG_TBSV printf("DoTbsv being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } numCommandQueues = 1; // NOTE: Hard-coding the number of command queues to 1i kargs->order = order; kargs->uplo = uplo; kargs->transA = trans; kargs->diag = diag; kargs->M = N; // store Original N kargs->N = N; kargs->K = K; kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; kargs->C = x; kargs->offCY = offx; kargs->ldc.vector = incx; kargs->startRow = 0; if(trans == clblasNoTrans) { kargs->endRow = (order == clblasRowMajor) ? N-1 : N; } else { kargs->endRow = (order == clblasRowMajor) ? 
N : N-1; } memcpy(&gbmvKargs, kargs, sizeof(CLBlasKargs)); gbmvKargs.pigFuncID = CLBLAS_GBMV; listInitHead(&seq); listInitHead(&gbmvSeq); err = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = makeSolutionSeq(CLBLAS_GBMV, &gbmvKargs, numCommandQueues, commandQueues, 0, NULL, NULL, &gbmvSeq); if (err == CL_SUCCESS) { err = orchestrateTBSV(kargs, &seq, &gbmvSeq, numEventsInWaitList, eventWaitList, events); } } freeSolutionSeq(&seq); freeSolutionSeq(&gbmvSeq); return (clblasStatus)err; } clblasStatus clblasStbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBSV printf("STBSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TBSV; kargs.alpha.argFloat = -1.0; kargs.beta.argFloat = 1.0; return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TBSV printf("DTBSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TBSV; kargs.alpha.argDouble = -1.0; kargs.beta.argDouble = 1.0; return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; FloatComplex alpha, beta; #ifdef DEBUG_TBSV printf("CTBSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TBSV; CREAL(alpha) = -1.0; CIMAG(alpha) = 0.0; CREAL(beta) = 1.0; CIMAG(beta) = 0.0; kargs.alpha.argFloatComplex = alpha; kargs.beta.argFloatComplex = beta; return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; DoubleComplex alpha, beta; #ifdef DEBUG_TBSV printf("ZTBSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TBSV; CREAL(alpha) = -1.0; CIMAG(alpha) = 0.0; CREAL(beta) = 1.0; CIMAG(beta) = 0.0; kargs.alpha.argDoubleComplex = alpha; kargs.beta.argDoubleComplex = beta; 
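    /*
     * Note (applies to all four precision wrappers): alpha = -1 and beta = +1
     * are consumed by the internal GBMV band updates that doTbsv() builds.
     * After each triangular block is solved, the remaining right-hand side is
     * updated as x := x - A_band * x_solved, i.e. a GBMV of the form
     * y = alpha*A*x + beta*y with alpha = -1 and beta = 1.
     */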
return doTbsv(&kargs, order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xtrmm.c000066400000000000000000000141471264277366700174160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "clblas-internal.h" #include "solution_seq.h" static clblasStatus doTrmm( CLBlasKargs *kargs, clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t msize; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) { return retCode; } msize = (side == clblasLeft) ? 
M : N; if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A, offA, lda, A_MAT_ERRSET ))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offB, ldb, B_MAT_ERRSET ))) { return retCode; } kargs->order = order; kargs->side = side; kargs->uplo = uplo; kargs->transA = transA; kargs->diag = diag; kargs->M = M; kargs->N = N; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; // Store original problem size in K, this is used to know it while // calculating result by parts using M or N as part size if (side == clblasLeft) { kargs->K = M; } else { kargs->K = N; } kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; #ifndef TRXM_MULTIPLE_QUEUES if (numCommandQueues != 0) { numCommandQueues = 1; } #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_TRMM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } clblasStatus clblasStrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.alpha.argFloat = alpha; return doTrmm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; return doTrmm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.alpha.argFloatComplex = alpha; return doTrmm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; 
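    /*
     * Note: unlike xGEMM there is no beta argument; TRMM overwrites B in
     * place with alpha * op(A) * B (or B * op(A) when side is clblasRight).
     */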
kargs.alpha.argDoubleComplex = alpha; return doTrmm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xtrmv.c000066400000000000000000000247601264277366700174310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" clblasStatus doTrmv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem x, size_t offx, int incx, cl_mem y, // Scratch Buffer cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t sizeOfVector; cl_event *newEventWaitList; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) { #ifdef DEBUG_TRMV printf("Invalid mem object..\n"); #endif return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_TRMV printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_TRMV printf("Invalid Size for X\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) { #ifdef DEBUG_TRMV printf("Invalid Size for scratch vector\n"); #endif return retCode; } #ifdef DEBUG_TRMV printf("DoTrmv being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } numCommandQueues = 1; if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } newEventWaitList = malloc((numEventsInWaitList+1) * sizeof(cl_event)); if (newEventWaitList == NULL) { return clblasOutOfHostMemory; } if (numEventsInWaitList != 0 ) { memcpy(newEventWaitList, eventWaitList, numEventsInWaitList*sizeof(cl_event)); } /* * ASSUMPTION: * doTRMV assumes "commandQueue" of 0. The same is reflected in * "makeSolutionSeq" as well. If either of them changes in future, * this code needs to be revisited. 
*/ sizeOfVector = (1 + (N-1)*abs(incx)) * dtypeSize(kargs->dtype); err = clEnqueueCopyBuffer(commandQueues[0], x, y, offx*dtypeSize(kargs->dtype), 0, sizeOfVector, numEventsInWaitList, eventWaitList, &newEventWaitList[numEventsInWaitList]); if (err != CL_SUCCESS) { free(newEventWaitList); return err; } kargs->order = order; kargs->uplo = uplo; kargs->transA = trans; kargs->diag = diag; kargs->N = N; kargs->K = N; //store original N kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; kargs->ldb.vector = incx; kargs->C = y; kargs->ldc.vector = incx; kargs->offBX = offx; kargs->offCY = 0; // Not used by assignKargs(); Just for clarity kargs->offa = offa; kargs->offA = offa; kargs->offsetM = 0; kargs->offsetN = 0; // kargs->offsetK = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; #ifdef DEBUG_TRMV printf("Calling makeSolutionSeq : TRMV\n"); #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_TRMV, kargs, numCommandQueues, commandQueues, numEventsInWaitList+1, newEventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); free(newEventWaitList); return (clblasStatus)err; } clblasStatus clblasStrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRMV printf("STRMV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TRMV; return doTrmv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRMV printf("DTRMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TRMV; return doTrmv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRMV printf("CTRMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TRMV; return doTrmv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; 
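    /*
     * Note: scratchBuff must hold at least (1 + (N-1) * |incx|) elements of
     * the matrix element type; doTrmv() above first copies X into it with
     * clEnqueueCopyBuffer() so the multiply does not read and write X at the
     * same time.
     */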
#ifdef DEBUG_TRMV printf("ZTRMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TRMV; return doTrmv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasStpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TPMV printf("STPMV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TPMV; return doTrmv(&kargs, order, uplo, trans, diag, N, AP, offa, 0 /* lda as zero for packed */, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TPMV printf("DTPMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TPMV; return doTrmv(&kargs, order, uplo, trans, diag, N, AP, offa, 0, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TPMV printf("CTPMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TPMV; return doTrmv(&kargs, order, uplo, trans, diag, N, AP, offa, 0, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TPMV printf("ZTPMV called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TPMV; return doTrmv(&kargs, order, uplo, trans, diag, N, AP, offa, 0, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/blas/xtrsm.cc000066400000000000000000001547001264277366700175670ustar00rootroot00000000000000 /************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" #include "TrtriClKernels.h" #include "TrtriKernelSourceIncludes.h" #include // Transform a trsm in clblasRowMajor into a trsm in clblasColumnMajor: // // The idea is basically that // B = A*X // can be computed as // B' = (A*X)' // = X'*A' // And since changing the order is basically a transpose on each matrix, // the formula becomes with the new order // B = X*A (so only the side, the uplo must be changed and the M and N sizes swapped) // // When enabled, only the ColumnMajor kernels need to be implemented // for all TRSM // #define FORCE_COLUMN_MAJOR 1 #if FORCE_COLUMN_MAJOR template static void force_trsm_column_major(Args & args) { if (args.order == clblasRowMajor) { std::swap(args.M, args.N); args.side = (args.side == clblasLeft ) ? clblasRight : clblasLeft ; args.uplo = (args.uplo == clblasUpper) ? clblasLower : clblasUpper ; args.order = clblasColumnMajor; } } #endif // // This file provide the public clBLAS API for // // clblasStrsm() // clblasDtrsm() // clblasCtrsm() // clblasZtrsm() // // using functors // // Potential optimizations: //// // - Get rid of the 'order' argument assuming that // row-major is equivalent to the transpose of column-major. // That is // // B = A*X // // is equivalent to // // B' = X'*A' // // and, when considering the opposite order, is equivalent to // // B = X*A (with A swap between upper and lower) // // By applying that transformation early, the functors implementing // the TRSMs only have to consider one of the two cases. // // // Common part of all XTRSM implementations using the old Solver infrastructure // #define CL_CHECK(RET) \ if(RET != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", RET, __LINE__); \ assert(false); \ } #define min(x,y) ((x)<(y)?(x):(y)) static char *getKernelName(cl_kernel clKernel) { cl_int err; // get kernel name size_t kernelNameLength; err = clGetKernelInfo( clKernel, CL_KERNEL_FUNCTION_NAME, sizeof(kernelNameLength), NULL, &kernelNameLength); // Do not check this error because of an nvidia bug. // The kernelNameLength turns out to be of proper length. // CL_CHECK(err) char *kernelName = new char[kernelNameLength]; err = clGetKernelInfo( clKernel, CL_KERNEL_FUNCTION_NAME, kernelNameLength*sizeof(char), kernelName, NULL ); CL_CHECK(err) return kernelName; } //FIXME: This function should be returning an error. 
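/*
 * Illustrative only -- not part of the library build (kept under #if 0).
 * A minimal host-side sketch of calling the TRSM entry points implemented in
 * this file, assuming clblasSetup() has already succeeded and that 'queue',
 * 'A' and 'B' are a valid command queue and device buffers created by the
 * caller (these names are hypothetical).  Error handling is omitted.
 */
#if 0
static void example_strsm_call(cl_command_queue queue, cl_mem A, cl_mem B,
                               size_t M, size_t N, size_t lda, size_t ldb)
{
    cl_event done;
    /* Solve A * X = alpha * B for X; the solution overwrites B. */
    clblasStatus status = clblasStrsm(clblasColumnMajor, clblasLeft, clblasLower,
                                      clblasNoTrans, clblasNonUnit,
                                      M, N, 1.0f,
                                      A, 0, lda,
                                      B, 0, ldb,
                                      1, &queue, 0, NULL, &done);
    if (status == clblasSuccess) {
        clWaitForEvents(1, &done);
    }
}
#endif

//
// makeKernel: return a built cl_kernel for the given source/binary, reusing a
// previously built one when possible.  Kernels are cached in a static map
// keyed by "<device>_<context>_<kernelName>"; on a cache miss the program is
// created from the pre-compiled binary when one is available (falling back to
// the OpenCL C source otherwise), built, and inserted into the cache.  As the
// TODO inside the function notes, the static cache is not yet thread safe.
//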
void makeKernel( cl_kernel *clKernel, cl_command_queue clQueue, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, size_t *kernelBinarySize, const char *binaryBuildOptions) { //TODO: This will need to be converted to thread local when making clBLAS thread safe typedef std::map kernel_map_t; static kernel_map_t kernel_map; cl_context clContext; cl_device_id clDevice; cl_int err; err = clGetCommandQueueInfo( clQueue, CL_QUEUE_CONTEXT, sizeof(clContext), &clContext, NULL); CL_CHECK(err) err = clGetCommandQueueInfo( clQueue, CL_QUEUE_DEVICE, sizeof(clDevice), &clDevice, NULL); CL_CHECK(err) std::stringstream ss; ss << clDevice << "_" << clContext; std::string prefix = ss.str(); if (*clKernel) { char *kernelName = getKernelName(*clKernel); // kernel has already been built, return #ifdef AUTOGEMM_PRINT_DEBUG printf("makeKernel: \"%s\" already built; returning.\n", kernelName); #endif // Check if kernel exists for this device std::string key = prefix + "_" + kernelName; kernel_map_t::iterator idx = kernel_map.find(key); // If kernel not found for this device, set to NULL if (idx == kernel_map.end()) { *clKernel = NULL; } else { *clKernel = idx->second; } delete[] kernelName; } if (!*clKernel) { // kernel has not been built, so build it (from binary, preferably) cl_program clProgram; cl_int clBinaryStatus; if (*kernelBinary) { #ifdef AUTOGEMM_PRINT_DEBUG printf("makeKernel: pre-compiled binary found: %llu bytes\n", *kernelBinarySize); printf("makeKernel: Creating program from binary\n"); #endif clProgram = clCreateProgramWithBinary( clContext, 1, &clDevice, kernelBinarySize, kernelBinary, &clBinaryStatus, &err ); #ifdef AUTOGEMM_PRINT_DEBUG if (err != CL_SUCCESS) { printf("makeKernel: Failed to create program with binary\n"); } #endif err = clBuildProgram( clProgram, 1, &clDevice, binaryBuildOptions, NULL, NULL ); #ifdef AUTOGEMM_PRINT_DEBUG if (err != CL_SUCCESS) { printf("makeKernel: Failed to build program from binary\n"); } #endif } if (!*kernelBinary || err != CL_SUCCESS) { #ifdef AUTOGEMM_PRINT_DEBUG printf("makeKernel: Creating program from source\n"); #endif clProgram = clCreateProgramWithSource( clContext, 1, &kernelSource, NULL, &err ); CL_CHECK(err) err = clBuildProgram( clProgram, 1, &clDevice, sourceBuildOptions, NULL, NULL ); CL_CHECK(err) } // print build failure if (err != CL_SUCCESS) { printf("clBuildProgram Failed\n"); printf("err = %d\n", err); size_t len = 0; clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); char* buildLog = new char[len]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0); printf("\nBuild Log:\n\n"); printf("%s\n", buildLog); //printf("\n\nKernel String:\n\n"); //printf("%s\n", kernelSource); //FIXME: The function should be exiting at this point } err = clCreateKernelsInProgram( clProgram, 1, clKernel, NULL ); CL_CHECK(err) err = clReleaseProgram(clProgram); CL_CHECK(err) char *kernelName = getKernelName(*clKernel); #ifdef AUTOGEMM_PRINT_DEBUG printf("makeKernel: \"%s\" now built; returning.\n", kernelName); #endif std::string key = prefix + "_" + kernelName; kernel_map[key] = *clKernel; delete[] kernelName; } return; } static cl_int clearBuffer(cl_command_queue queue, cl_mem buffer, size_t buffer_size) { cl_int err = 0; cl_event event; // Hummm clEnqueueFillBuffer is OpenCL 1.2 !!! 
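/*
 * Portability sketch for the clearBuffer() helper (illustrative only):
 * clEnqueueFillBuffer() is an OpenCL 1.2 entry point, as the comment above
 * notes. On a 1.1-only runtime, one possible fallback is to stream a
 * zero-initialised host buffer with clEnqueueWriteBuffer() instead, e.g.:
 *
 *   void *zeros = calloc(1, buffer_size);
 *   cl_int e = clEnqueueWriteBuffer(queue, buffer, CL_TRUE, 0, buffer_size,
 *                                   zeros, 0, NULL, NULL);
 *   free(zeros);
 *
 * The code as written assumes a 1.2-capable platform and keeps the simpler
 * fill path.
 */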
double zero = 0.0; err = clEnqueueFillBuffer(queue, buffer, &zero, sizeof(double), 0, // offset buffer_size, 0, NULL, &event ); return err; } // Compute the number of blocks of the specified 'size' to fully cover 'n' // Simply speaking, this is n/size rounded up. #define BLOCKS(n,size) ( ( (n) / size ) + ( (n) % (size) != 0 ) ) cl_int call_kernel_triple_update192( cl_kernel *kernel, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, size_t *kernelBinarySize, const char *binaryBuildOptions, const cl_command_queue queue, cl_mem A, unsigned int offA, cl_mem d_dinvA, int i, unsigned int lda, int M, cl_event *event) { cl_int err = 0; unsigned int m = M; int npages = M / (i * 2) + (M % (i * 2) != 0); size_t globalLocal[2]; size_t globalThreads[2]; switch (i) { case 12: globalLocal[0] = 12; globalLocal[1] = 1; globalThreads[0] = npages * 12; globalThreads[1] = 1; break; case 24: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 24; globalThreads[1] = 2; break; case 48: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 48; globalThreads[1] = 4; break; case 96: globalLocal[0] = 24; globalLocal[1] = 2; globalThreads[0] = npages * 96; globalThreads[1] = 8; break; default: break; } makeKernel(kernel, queue, kernelSource, sourceBuildOptions, kernelBinary, kernelBinarySize, binaryBuildOptions); /* if (err != CL_SUCCESS) { //printf( "create kernel %s failed with %d\n", kernel_name, err ); return err; } */ clSetKernelArg(*kernel, 0, sizeof(cl_mem), &A); clSetKernelArg(*kernel, 1, sizeof(unsigned int), &offA); clSetKernelArg(*kernel, 2, sizeof(cl_mem), &d_dinvA); clSetKernelArg(*kernel, 3, sizeof(int), &i); clSetKernelArg(*kernel, 4, sizeof(unsigned int), &lda); clSetKernelArg(*kernel, 5, sizeof(int), &npages); clSetKernelArg(*kernel, 6, sizeof(unsigned int), &m); err = clEnqueueNDRangeKernel(queue, *kernel, 2, NULL, globalThreads, globalLocal, 0, NULL, event); return err; } cl_int diag_dtrtri192( cl_command_queue queue, int M, clblasUplo uplo, clblasDiag diag, cl_mem A, size_t offA, cl_mem d_dinvA, size_t lda, int inner_block_size, int outer_block_size, cl_event *event) { const char *diag_dtrtri_kernel_upper_KernelSource = NULL; cl_kernel *diag_dtrtri_kernel_upper_ClKernel = NULL; size_t diag_dtrtri_kernel_upper_KernelBinarySize = 0; const unsigned char *diag_dtrtri_kernel_upper_KernelBinary = NULL; cl_int err; /* This routine is used in dtrsm */ //For side==right, M is actually N here int nthreads = (M / inner_block_size + (M % inner_block_size != 0)) * inner_block_size; unsigned int m = M; if (uplo == clblasLower) { //lower is not supported yet } else { diag_dtrtri_kernel_upper_KernelSource = diag_dtrtri_upper_192_12_src; diag_dtrtri_kernel_upper_ClKernel = &diag_dtrtri_upper_192_12_clKernel; diag_dtrtri_kernel_upper_KernelBinary = diag_dtrtri_upper_192_12_bin; diag_dtrtri_kernel_upper_KernelBinarySize = diag_dtrtri_upper_192_12_binSize; //cl_kernel diag_dtrtri_kernel_upper = clCreateKernel(prg, "DIAG_DTRTRI_KERNEL_UPPER", &err); makeKernel(diag_dtrtri_kernel_upper_ClKernel, queue, diag_dtrtri_kernel_upper_KernelSource, TrtriBuildOptions, &diag_dtrtri_kernel_upper_KernelBinary, &diag_dtrtri_kernel_upper_KernelBinarySize, TrtribinBuildOptions); int isDiagUnit = (diag == clblasUnit); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 0, sizeof(int), &isDiagUnit); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 1, sizeof(cl_mem), &A); CL_CHECK(err); err = 
clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 2, sizeof(unsigned int), &offA); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 3, sizeof(cl_mem), &d_dinvA); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 4, sizeof(unsigned int), &lda); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 5, sizeof(unsigned int), &m); CL_CHECK(err); size_t globalThreads[1] = { nthreads }; size_t globalLocal[1] = { inner_block_size }; err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_upper_ClKernel, 1, NULL, globalThreads, globalLocal, 0, NULL, event); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } //clReleaseKernel(diag_dtrtri_kernel_upper); if (err != CL_SUCCESS) { return err; } // update the inverse up to the size of BLOCK_SIZE for (int i = inner_block_size; i < outer_block_size; i *= 2) { switch (i) { case 12: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_12_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update192(&triple_dgemm_update_192_12_R_clKernel, triple_dgemm_update_192_12_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_12_R_bin, &triple_dgemm_update_192_12_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 24: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_24_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART1_R_clKernel, triple_dgemm_update_192_24_PART1_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_24_PART1_R_bin, &triple_dgemm_update_192_24_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update192(&triple_dgemm_update_192_24_PART2_R_clKernel, triple_dgemm_update_192_24_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_24_PART2_R_bin, &triple_dgemm_update_192_24_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 48: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_48_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART1_R_clKernel, triple_dgemm_update_192_48_PART1_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_48_PART1_R_bin, &triple_dgemm_update_192_48_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update192(&triple_dgemm_update_192_48_PART2_R_clKernel, triple_dgemm_update_192_48_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_48_PART2_R_bin, &triple_dgemm_update_192_48_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 96: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_96_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART1_R_clKernel, triple_dgemm_update_192_96_PART1_R_src, TrtriBuildOptions, (const unsigned 
char **)&triple_dgemm_update_192_96_PART1_R_bin, &triple_dgemm_update_192_96_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update192(&triple_dgemm_update_192_96_PART2_R_clKernel, triple_dgemm_update_192_96_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_192_96_PART2_R_bin, &triple_dgemm_update_192_96_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; default: break; } if (i * 2 >= M) break; } } return err; } static clblasStatus gpu_dtrsm192( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { if (order != clblasColumnMajor) return clblasNotImplemented; if ((M % 192 == 0) && (N % 192 == 0)) { //TODO: the implementation of sub block being 192 only supports //side == right //uplo == upper //trans == notrans //M and N need to be mod192 //subblock being 192 is prefered over 128 on Hawaii device since //it does not create "boundary" in DGEMM calls //Hawaii DGEMM calls have better performance when M N K are mod48 if ((side == clblasRight) && (uplo == clblasUpper) && (transA == clblasNoTrans)) { int inner_block_size = 12; // inner blocking size, <=32 int outer_block_size = 192;// outer blocking size, >BLOCK_SIZE cl_int err = 0; int i; cl_context context; err = getQueueContext(commandQueues[0], &context); CL_CHECK(err); /* quick return on wrong size */ if (M <= 0 || N <= 0) return clblasInvalidDim; double neg_one = -1.0; double one = 1.0; double zero = 0.0; cl_mem InvA = 0; cl_mem X = 0; // X of size mxn will contain the result size_t ldX = M; size_t offX = 0; //must be 0: needed by the _(X,i,j) macro size_t size_X = N*ldX * sizeof(double); X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err); CL_CHECK(err); err = clearBuffer(commandQueues[0], X, size_X); CL_CHECK(err); // side=R /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE */ /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ size_t ldInvA = outer_block_size; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(N, outer_block_size) * outer_block_size *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); CL_CHECK(err); err = clearBuffer(commandQueues[0], InvA, size_InvA); CL_CHECK(err); diag_dtrtri192(commandQueues[0], N, uplo, diag, A, offA, InvA, lda, inner_block_size, outer_block_size, events); if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ // lower is not implemented yet } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = min(outer_block_size, (int)N); //DGEMM_RIGHT( M, nn, nn, alpha, _(B,0,0), _(InvA,0,0), zero, _(X,0,0) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, alpha, B, offB, ldb, InvA, offInvA, ldInvA, zero, X, offX, ldX, 1, commandQueues, 0, NULL, events); CL_CHECK(err); if (outer_block_size < N) { //DGEMM_RIGHT( M, N-nb, nb, neg_one, _(X,0,0), _(A,0,nb), alpha, _(B,0,nb) ); err = 
clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - outer_block_size, outer_block_size, neg_one, X, offX, ldX, A, offA + lda*outer_block_size, lda, alpha, B, offB + outer_block_size*ldb, ldb, 1, commandQueues, 0, NULL, events); assert(err == CL_SUCCESS); /* the rest blocks */ for (i = outer_block_size; i < N; i += outer_block_size) { nn = min(outer_block_size, (int)N - i); //DGEMM_RIGHT( M, nn, nn, one, _(B,0,i), _(InvA,0,i), zero, _(X,0,i) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, nn, nn, one, B, offB + i*ldb, ldb, InvA, offInvA + i*outer_block_size, ldInvA, zero, X, offX + i*ldX, ldX, 1, commandQueues, 0, NULL, events); assert(err == CL_SUCCESS); if (i + outer_block_size >= N) break; //DGEMM_RIGHT( M, N-i-nb, nb, neg_one, _(X,0,i), _(A,i,i+nb), one, _(B,0,i+nb) ); err = clblasDgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N - i - outer_block_size, outer_block_size, neg_one, X, offX + i*ldX, ldX, A, offA + i + (outer_block_size + i)*lda, lda, one, B, offB + (i + outer_block_size)*ldb, ldb, 1, commandQueues, 0, NULL, events); assert(err == CL_SUCCESS); } } } } else { /* the transpose case */ // trans is not implemented yet } { size_t src_origin[3] = { 0, 0, 0 }; size_t dst_origin[3] = { offB*sizeof(double), 0, 0 }; size_t region[3] = { M*sizeof(double), N, 1 }; err = clEnqueueCopyBufferRect(commandQueues[0], X, B, src_origin, dst_origin, region, ldX*sizeof(double), 0, ldb*sizeof(double), 0, 0, NULL, events); CL_CHECK(err); clReleaseMemObject(InvA); clReleaseMemObject(X); } specialCaseHandled = true; return clblasSuccess; } } return clblasNotImplemented; } cl_int call_kernel_triple_update128( cl_kernel *kernel, const char *kernelSource, const char *sourceBuildOptions, const unsigned char **kernelBinary, size_t *kernelBinarySize, const char *binaryBuildOptions, const cl_command_queue queue, cl_mem A, unsigned int offA, cl_mem d_dinvA, int i, unsigned int lda, int M, cl_event *event) { cl_int err = 0; unsigned int m = M; int npages = M / (i * 2) + (M % (i * 2) != 0); size_t globalLocal[2] = { (i <= 32) ? 
(i / 4) : 16, 4 }; size_t globalThreads[2] = { (i / (globalLocal[0] * globalLocal[1]))* globalLocal[0], npages*(i / 16) * globalLocal[1] }; makeKernel(kernel, queue, kernelSource, sourceBuildOptions, kernelBinary, kernelBinarySize, binaryBuildOptions); /* if (err != CL_SUCCESS) { //printf( "create kernel %s failed with %d\n", kernel_name, err ); return err; } */ clSetKernelArg(*kernel, 0, sizeof(cl_mem), &A); clSetKernelArg(*kernel, 1, sizeof(unsigned int), &offA); clSetKernelArg(*kernel, 2, sizeof(cl_mem), &d_dinvA); clSetKernelArg(*kernel, 3, sizeof(int), &i); clSetKernelArg(*kernel, 4, sizeof(unsigned int), &lda); clSetKernelArg(*kernel, 5, sizeof(int), &npages); clSetKernelArg(*kernel, 6, sizeof(unsigned int), &m); err = clEnqueueNDRangeKernel(queue, *kernel, 2, NULL, globalThreads, globalLocal, 0, NULL, event); return err; } cl_int diag_dtrtri128( cl_command_queue queue, int M, clblasUplo uplo, clblasDiag diag, cl_mem A, size_t offA, cl_mem d_dinvA, size_t lda, int inner_block_size, int outer_block_size, cl_event *event) { const char *diag_dtrtri_kernel_upper_KernelSource = NULL; cl_kernel *diag_dtrtri_kernel_upper_ClKernel = NULL; size_t diag_dtrtri_kernel_upper_KernelBinarySize = 0; const unsigned char *diag_dtrtri_kernel_upper_KernelBinary = NULL; const char *diag_dtrtri_kernel_lower_KernelSource = NULL; cl_kernel *diag_dtrtri_kernel_lower_ClKernel = NULL; size_t diag_dtrtri_kernel_lower_KernelBinarySize = 0; const unsigned char *diag_dtrtri_kernel_lower_KernelBinary = NULL; cl_int err = 0; /* This routine is used in dtrsm */ int nthreads = (M / inner_block_size + (M % inner_block_size != 0)) * inner_block_size; unsigned int m = M; if (uplo == clblasLower) { diag_dtrtri_kernel_lower_KernelSource = diag_dtrtri_lower_128_16_src; diag_dtrtri_kernel_lower_ClKernel = &diag_dtrtri_lower_128_16_clKernel; diag_dtrtri_kernel_lower_KernelBinary = diag_dtrtri_lower_128_16_bin; diag_dtrtri_kernel_lower_KernelBinarySize = diag_dtrtri_lower_128_16_binSize; makeKernel(diag_dtrtri_kernel_lower_ClKernel, queue, diag_dtrtri_kernel_lower_KernelSource, TrtriBuildOptions, &diag_dtrtri_kernel_lower_KernelBinary, &diag_dtrtri_kernel_lower_KernelBinarySize, TrtribinBuildOptions); int isDiagUnit = (diag == clblasUnit); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 0, sizeof(int), &isDiagUnit); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 1, sizeof(cl_mem), &A); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 2, sizeof(unsigned int), &offA); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 3, sizeof(cl_mem), &d_dinvA); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 4, sizeof(unsigned int), &lda); clSetKernelArg(*diag_dtrtri_kernel_lower_ClKernel, 5, sizeof(unsigned int), &m); size_t globalThreads[1] = { nthreads }; size_t globalLocal[1] = { inner_block_size }; err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_lower_ClKernel, 1, NULL, globalThreads, globalLocal, 0, NULL, event); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_lower- failed with %d\n", err ); return err; } // update the inverse up to the size of BLOCK_SIZE for (int i = inner_block_size; i < outer_block_size; i *= 2) { switch (i) { case 16: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_16_PART1_L_clKernel, triple_dgemm_update_128_16_PART1_L_src, TrtriBuildOptions, (const 
unsigned char **)&triple_dgemm_update_128_16_PART1_L_bin, &triple_dgemm_update_128_16_PART1_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_16_PART2_L_clKernel, triple_dgemm_update_128_16_PART2_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_16_PART2_L_bin, &triple_dgemm_update_128_16_PART2_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 32: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART1_L_clKernel, triple_dgemm_update_128_32_PART1_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_32_PART1_L_bin, &triple_dgemm_update_128_32_PART1_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART2_L_clKernel, triple_dgemm_update_128_32_PART2_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_32_PART2_L_bin, &triple_dgemm_update_128_32_PART2_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 64: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART1_L_clKernel, triple_dgemm_update_128_64_PART1_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_64_PART1_L_bin, &triple_dgemm_update_128_64_PART1_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART2_L_clKernel, triple_dgemm_update_128_64_PART2_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_64_PART2_L_bin, &triple_dgemm_update_128_64_PART2_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; default: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_L", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART1_L_clKernel, triple_dgemm_update_128_ABOVE64_PART1_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART1_L_bin, &triple_dgemm_update_128_ABOVE64_PART1_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART2_L_clKernel, triple_dgemm_update_128_ABOVE64_PART2_L_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART2_L_bin, &triple_dgemm_update_128_ABOVE64_PART2_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART3_L_clKernel, triple_dgemm_update_128_ABOVE64_PART3_L_src, TrtriBuildOptions, (const unsigned char 
**)&triple_dgemm_update_128_ABOVE64_PART3_L_bin, &triple_dgemm_update_128_ABOVE64_PART3_L_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; } if (i * 2 >= M) break; } } else { diag_dtrtri_kernel_upper_KernelSource = diag_dtrtri_upper_128_16_src; diag_dtrtri_kernel_upper_ClKernel = &diag_dtrtri_upper_128_16_clKernel; diag_dtrtri_kernel_upper_KernelBinary = diag_dtrtri_upper_128_16_bin; diag_dtrtri_kernel_upper_KernelBinarySize = diag_dtrtri_upper_128_16_binSize; makeKernel(diag_dtrtri_kernel_upper_ClKernel, queue, diag_dtrtri_kernel_upper_KernelSource, TrtriBuildOptions, &diag_dtrtri_kernel_upper_KernelBinary, &diag_dtrtri_kernel_upper_KernelBinarySize, TrtribinBuildOptions); int isDiagUnit = (diag == clblasUnit); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 0, sizeof(int), &isDiagUnit); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 1, sizeof(cl_mem), &A); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 2, sizeof(unsigned int), &offA); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 3, sizeof(cl_mem), &d_dinvA); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 4, sizeof(unsigned int), &lda); CL_CHECK(err); err = clSetKernelArg(*diag_dtrtri_kernel_upper_ClKernel, 5, sizeof(unsigned int), &m); CL_CHECK(err); size_t globalThreads[1] = { nthreads }; size_t globalLocal[1] = { inner_block_size }; err = clEnqueueNDRangeKernel(queue, *diag_dtrtri_kernel_upper_ClKernel, 1, NULL, globalThreads, globalLocal, 0, NULL, NULL); CL_CHECK(err); //err = clFinish(queue); //CL_CHECK(err); if (err != CL_SUCCESS) { //printf( "kernel -diag_dtrtri_kernel_upper- failed with %d\n", err ); return err; } //clReleaseKernel(diag_dtrtri_kernel_upper); //if (err != CL_SUCCESS) { // return err; //} // update the inverse up to the size of BLOCK_SIZE for (int i = inner_block_size; i < outer_block_size; i *= 2) { switch (i) { case 16: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_16_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_16_R_clKernel, triple_dgemm_update_128_16_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_16_R_bin, &triple_dgemm_update_128_16_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); //err = clFinish(queue); //CL_CHECK(err); break; case 32: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_32_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART1_R_clKernel, triple_dgemm_update_128_32_PART1_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_32_PART1_R_bin, &triple_dgemm_update_128_32_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_32_PART2_R_clKernel, triple_dgemm_update_128_32_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_32_PART2_R_bin, &triple_dgemm_update_128_32_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; case 64: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_64_PART2_R", prg, queue, A, offA, 
d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART1_R_clKernel, triple_dgemm_update_128_64_PART1_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_64_PART1_R_bin, &triple_dgemm_update_128_64_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_64_PART2_R_clKernel, triple_dgemm_update_128_64_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_64_PART2_R_bin, &triple_dgemm_update_128_64_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; default: //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART1_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART2_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); //CALL_KERNEL_TRIPLE_UPDATE("TRIPLE_DGEMM_UPDATE_ABOVE64_PART3_R", prg, queue, A, offA, d_dinvA, i, lda, M, event); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART1_R_clKernel, triple_dgemm_update_128_ABOVE64_PART1_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART1_R_bin, &triple_dgemm_update_128_ABOVE64_PART1_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART2_R_clKernel, triple_dgemm_update_128_ABOVE64_PART2_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART2_R_bin, &triple_dgemm_update_128_ABOVE64_PART2_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); err = call_kernel_triple_update128(&triple_dgemm_update_128_ABOVE64_PART3_R_clKernel, triple_dgemm_update_128_ABOVE64_PART3_R_src, TrtriBuildOptions, (const unsigned char **)&triple_dgemm_update_128_ABOVE64_PART3_R_bin, &triple_dgemm_update_128_ABOVE64_PART3_R_binSize, TrtribinBuildOptions, queue, A, offA, d_dinvA, i, lda, M, event); CL_CHECK(err); break; } if (i * 2 >= M) break; } } return err; } static clblasStatus gpu_dtrsm128( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t ldA, cl_mem B, size_t offB, size_t ldB, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events, bool &specialCaseHandled) { if (order != clblasColumnMajor) return clblasNotImplemented; int inner_block_size = 16; // inner blocking size, <=32 int outer_block_size = 128;// outer blocking size, >BLOCK_SIZE cl_int err = 0; int i; cl_context context; err = getQueueContext(commandQueues[0], &context); CL_CHECK(err); /* quick return on wrong size */ if (M <= 0 || N <= 0) return clblasInvalidDim; double neg_one = -1.0; double one = 1.0; double zero = 0.0; // Helper to compute pass the 3 arguments describing a (sub)-matrix to clblasDgemm #define _(M,i,j) M , (off##M + ((i)+(j)*ld##M) ) , ld##M cl_mem InvA = 0; cl_mem X = 0; // X of size mxn will contain the result size_t ldX = M; size_t offX = 0; //must be 0: needed by the _(X,i,j) macro size_t size_X = N*ldX * sizeof(double); X = clCreateBuffer(context, CL_MEM_READ_WRITE, size_X, NULL, &err); CL_CHECK(err); err = clearBuffer(commandQueues[0], X, size_X); CL_CHECK(err); if (side == clblasLeft) { // side=L /* invert the diagonals * Allocate device memory for the 
inverted diagonal blocks, size=m*nb */ size_t ldInvA = outer_block_size; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(M, outer_block_size) * outer_block_size *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); CL_CHECK(err); err = clearBuffer(commandQueues[0], InvA, size_InvA); CL_CHECK(err); err = diag_dtrtri128(commandQueues[0], M, uplo, diag, A, offA, InvA, ldA, inner_block_size, outer_block_size, events); CL_CHECK(err); // // Helper for C = alpha * transp(A) * B + beta * C // // In the calls below: // - the 1st matrix shall be either A or InvA transposed according to transA. // - the 2nd and 3rd matrices are either B and X // #define DGEMM_LEFT(m, n, k, alpha, A, B, beta, C) \ do { \ err = clblasDgemm(clblasColumnMajor, transA, clblasNoTrans , m, n, k, alpha, A, B, beta, C , 1, commandQueues, 0, NULL, events ) ; \ CL_CHECK(err); \ } while(0) if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int mm = min(outer_block_size, (int)M); DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, 0), _(B, 0, 0), zero, _(X, 0, 0)); if (outer_block_size < M) { DGEMM_LEFT(M - outer_block_size, N, outer_block_size, neg_one, _(A, outer_block_size, 0), _(X, 0, 0), alpha, _(B, outer_block_size, 0)); /* the rest blocks */ for (i = outer_block_size; i < M; i += outer_block_size) { mm = min((int)M - i, outer_block_size); DGEMM_LEFT(mm, N, mm, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i + outer_block_size >= M) break; DGEMM_LEFT(M - i - outer_block_size, N, outer_block_size, neg_one, _(A, i + outer_block_size, i), _(X, i, 0), one, _(B, i + outer_block_size, 0)); } //check_last_error() ; } } else // if ( uplo == clblasUpper) { /* the upper case */ /* handle the first block seperately with alpha */ int mm = (M % outer_block_size == 0) ? outer_block_size : (M % outer_block_size); i = M - mm; DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i - outer_block_size >= 0) { DGEMM_LEFT(i, N, mm, neg_one, _(A, 0, i), _(X, i, 0), alpha, _(B, 0, 0)); /* the rest blocks */ for (i = M - mm - outer_block_size; i >= 0; i -= outer_block_size) { DGEMM_LEFT(outer_block_size, N, outer_block_size, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i - outer_block_size < 0) break; DGEMM_LEFT(i, N, outer_block_size, neg_one, _(A, 0, i), _(X, i, 0), one, _(B, 0, 0)); } } } } else { /* the transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int mm = (M % outer_block_size == 0) ? 
outer_block_size : (M % outer_block_size); i = M - mm; DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i - outer_block_size >= 0) { DGEMM_LEFT(i, N, mm, neg_one, _(A, i, 0), _(X, i, 0), alpha, _(B, 0, 0)); /* the rest blocks */ for (i = M - mm - outer_block_size; i >= 0; i -= outer_block_size) { DGEMM_LEFT(outer_block_size, N, outer_block_size, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i - outer_block_size < 0) break; DGEMM_LEFT(i, N, outer_block_size, neg_one, _(A, i, 0), _(X, i, 0), one, _(B, 0, 0)); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int mm = min(outer_block_size, (int)M); DGEMM_LEFT(mm, N, mm, alpha, _(InvA, 0, 0), _(B, 0, 0), zero, _(X, 0, 0)); if (outer_block_size < M) { DGEMM_LEFT(M - outer_block_size, N, outer_block_size, neg_one, _(A, 0, outer_block_size), _(X, 0, 0), alpha, _(B, outer_block_size, 0)); /* the rest blocks */ for (i = outer_block_size; i < M; i += outer_block_size) { mm = min((int)M - i, outer_block_size); DGEMM_LEFT(mm, N, mm, one, _(InvA, 0, i), _(B, i, 0), zero, _(X, i, 0)); if (i + outer_block_size >= M) break; DGEMM_LEFT(M - i - outer_block_size, N, outer_block_size, neg_one, _(A, i, i + outer_block_size), _(X, i, 0), one, _(B, i + outer_block_size, 0)); } } } } } else { // // Helper for C = alpha * B * A + beta * C // // In the calls below // - the 2nd matrix shall be either A or InvA transposed according to transA // - the 1st and 3rd matrices are either B and X // #define DGEMM_RIGHT(m,n,k, alpha, B, A, beta, C ) \ do { \ err = clblasDgemm(clblasColumnMajor, clblasNoTrans, transA , m, n, k, alpha, B, A, beta, C , 1, commandQueues, 0, NULL, events ) ; \ CL_CHECK(err); \ } while(0) // side=R /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=n*BLOCK_SIZE */ /* invert the diagonals * Allocate device memory for the inverted diagonal blocks, size=m*nb */ size_t ldInvA = outer_block_size; size_t offInvA = 0; //must be 0: needed by the _(X,i,j) macro size_t size_InvA = ldInvA * BLOCKS(N, outer_block_size) * outer_block_size *sizeof(double); InvA = clCreateBuffer(context, CL_MEM_READ_WRITE, size_InvA, NULL, &err); CL_CHECK(err); err = clearBuffer(commandQueues[0], InvA, size_InvA); CL_CHECK(err); err = diag_dtrtri128(commandQueues[0], N, uplo, diag, A, offA, InvA, ldA, inner_block_size, outer_block_size, events); CL_CHECK(err); if (transA == clblasNoTrans) { /* the non-transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int nn = (N % outer_block_size == 0) ? 
outer_block_size : (N % outer_block_size); i = N - nn; DGEMM_RIGHT(M, nn, nn, alpha, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i - outer_block_size >= 0) { DGEMM_RIGHT(M, i, nn, neg_one, _(X, 0, i), _(A, i, 0), alpha, _(B, 0, 0)); /* the rest blocks */ for (i = N - nn - outer_block_size; i >= 0; i -= outer_block_size) { DGEMM_RIGHT(M, outer_block_size, outer_block_size, one, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i - outer_block_size < 0) break; DGEMM_RIGHT(M, i, outer_block_size, neg_one, _(X, 0, i), _(A, i, 0), one, _(B, 0, 0)); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = min(outer_block_size, (int)N); DGEMM_RIGHT(M, nn, nn, alpha, _(B, 0, 0), _(InvA, 0, 0), zero, _(X, 0, 0)); if (outer_block_size < N) { DGEMM_RIGHT(M, N - outer_block_size, outer_block_size, neg_one, _(X, 0, 0), _(A, 0, outer_block_size), alpha, _(B, 0, outer_block_size)); /* the rest blocks */ for (i = outer_block_size; i < N; i += outer_block_size) { nn = min(outer_block_size, (int)N - i); DGEMM_RIGHT(M, nn, nn, one, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i + outer_block_size >= N) break; DGEMM_RIGHT(M, N - i - outer_block_size, outer_block_size, neg_one, _(X, 0, i), _(A, i, i + outer_block_size), one, _(B, 0, i + outer_block_size)); } } } } else { /* the transpose case */ if (uplo == clblasLower) { /* the lower case */ /* handle the first block seperately with alpha */ int nn = min(outer_block_size, (int)N); DGEMM_RIGHT(M, nn, nn, alpha, _(B, 0, 0), _(InvA, 0, 0), zero, _(X, 0, 0)); if (outer_block_size < N) { DGEMM_RIGHT(M, N - outer_block_size, outer_block_size, neg_one, _(X, 0, 0), _(A, outer_block_size, 0), alpha, _(B, 0, outer_block_size)); /* the rest blocks */ for (i = outer_block_size; i < N; i += outer_block_size) { nn = min(outer_block_size, (int)N - i); DGEMM_RIGHT(M, nn, nn, one, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i + outer_block_size >= N) break; DGEMM_RIGHT(M, N - i - outer_block_size, outer_block_size, neg_one, _(X, 0, i), _(A, outer_block_size + i, i), one, _(B, 0, i + outer_block_size)); } } } else { /* the upper case */ /* handle the first block seperately with alpha */ int nn = (N % outer_block_size == 0) ? 
outer_block_size : (N % outer_block_size); i = N - nn; DGEMM_RIGHT(M, nn, nn, alpha, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i - outer_block_size >= 0) { DGEMM_RIGHT(M, i, nn, neg_one, _(X, 0, i), _(A, 0, i), alpha, _(B, 0, 0)); /* the rest blocks */ for (i = N - nn - outer_block_size; i >= 0; i -= outer_block_size) { DGEMM_RIGHT(M, outer_block_size, outer_block_size, one, _(B, 0, i), _(InvA, 0, i), zero, _(X, 0, i)); if (i - outer_block_size < 0) break; DGEMM_RIGHT(M, i, outer_block_size, neg_one, _(X, 0, i), _(A, 0, i), one, _(B, 0, 0)); } } } } } // Copy X(m,n) to B(m,n) { size_t src_origin[3] = { 0, 0, 0 }; size_t dst_origin[3] = { offB*sizeof(double), 0, 0 }; size_t region[3] = { M*sizeof(double), N, 1 }; err = clEnqueueCopyBufferRect(commandQueues[0], X, B, src_origin, dst_origin, region, ldX*sizeof(double), 0, ldB*sizeof(double), 0, 0, NULL, events); CL_CHECK(err); clReleaseMemObject(InvA); clReleaseMemObject(X); specialCaseHandled = true; return clblasSuccess; } return clblasNotImplemented; } static clblasStatus doTrsm( CLBlasKargs *kargs, clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err; ListHead seq; size_t msize; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) { return retCode; } msize = (side == clblasLeft) ? M : N; if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A, offA, lda, A_MAT_ERRSET))) { return retCode; } if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offB, ldb, B_MAT_ERRSET))) { return retCode; } kargs->order = order; kargs->side = side; kargs->uplo = uplo; kargs->transA = transA; kargs->diag = diag; kargs->M = M; kargs->N = N; kargs->A = A; kargs->offA = offA; kargs->lda.matrix = lda; kargs->B = B; kargs->offBX = offB; kargs->ldb.matrix = ldb; // Store original problem size in K, this is used to know it while // calculating result by parts using M or N as part size if (side == clblasLeft) { kargs->K = M; } else { kargs->K = N; } kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; #ifndef TRXM_MULTIPLE_QUEUES if (numCommandQueues != 0) { numCommandQueues = 1; } #endif listInitHead(&seq); err = makeSolutionSeq(CLBLAS_TRSM, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = executeSolutionSeq(&seq); } freeSolutionSeq(&seq); return (clblasStatus)err; } extern "C" clblasStatus clblasStrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasStrsmFunctor::Args args(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, queue, 
numEventsInWaitList, eventWaitList, events); #if FORCE_COLUMN_MAJOR force_trsm_column_major(args); #endif clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); clblasStrsmFunctor * functor = fselector->select_strsm_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasDtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, cl_double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { /* CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasDtrsmFunctor::Args args(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, queue, numEventsInWaitList, eventWaitList, events); clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); clblasDtrsmFunctor * functor = fselector->select_dtrsm_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; */ bool specialCaseHandled = false; //outer block size = 192 //inner block size = 12 clblasStatus SpecialCaseStatus; SpecialCaseStatus = gpu_dtrsm192(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return SpecialCaseStatus; SpecialCaseStatus = gpu_dtrsm128(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, specialCaseHandled); if (specialCaseHandled) return SpecialCaseStatus; CLBlasKargs kargs; memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.alpha.argDouble = alpha; return doTrsm(&kargs, order, side, uplo, transA, diag, M, N, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } extern "C" clblasStatus clblasCtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasCtrsmFunctor::Args args(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, queue, numEventsInWaitList, eventWaitList, events); #if FORCE_COLUMN_MAJOR force_trsm_column_major(args); #endif clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); clblasCtrsmFunctor * functor = fselector->select_ctrsm_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } extern "C" clblasStatus clblasZtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, 
cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CHECK_QUEUES(numCommandQueues, commandQueues); CHECK_EVENTS(numEventsInWaitList, eventWaitList); if ( numCommandQueues>1 ) { numCommandQueues = 1 ; // No support for multi-device (yet) } cl_command_queue queue = commandQueues[0]; clblasZtrsmFunctor::Args args(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, queue, numEventsInWaitList, eventWaitList, events); #if FORCE_COLUMN_MAJOR force_trsm_column_major(args); #endif clblasFunctorSelector * fselector = clblasFunctorSelector::find(queue); clblasZtrsmFunctor * functor = fselector->select_ztrsm_specific(args); clblasStatus res = functor->execute(args); functor->release(); return res; } clblas-2.10/src/library/blas/xtrsv.c000066400000000000000000000434331264277366700174350ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include "clblas-internal.h" #include "solution_seq.h" //#define DEBUG_TRSV static clblasUplo getUpLo(CLBlasKargs *kargs) { if (kargs->order == clblasColumnMajor) { return kargs->uplo; } if (kargs->uplo == clblasUpper) { return clblasLower; } return clblasUpper; } static clblasStatus orchestrateNonTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; SolutionStep *trtri, *gemv; size_t nLoops, i; cl_event *eventArray; size_t TARGET_ROWS; ListNode *f = listNodeFirst(trtriSeq); trtri = container_of(f, node, SolutionStep); f = listNodeFirst(gemvSeq); gemv = container_of(f, node, SolutionStep); TARGET_ROWS = trtri->subdims->y; if ((trtri->subdims->y) != (gemv->subdims->y)) { printf("TRSV: WARNING: TRTRI and GEMV dont have identical sub-divisions!!! %lu and %lu\n", trtri->subdims->y, gemv->subdims->y); return clblasNotImplemented; } else { #ifdef DEBUG_TRSV printf("TRSV: MESSAGE: TRTRI and GEMV have identical sub-divisions! 
= %lu\n", TARGET_ROWS); #endif } trtri->numEventsInWaitList = numEventsInWaitList; trtri->eventWaitList = eventWaitList; if (kargs->N <= TARGET_ROWS) { trtri->event = events; trtri->args.startRow = 0; trtri->args.endRow = (cl_int)((kargs->N)-1); err = executeSolutionSeq(trtriSeq); return err; } // // Allocate Event Chain // nLoops = ((kargs->N) / TARGET_ROWS); if ((kargs->N % TARGET_ROWS)) { nLoops++; } #ifdef DEBUG_TRSV printf("TRSV: Orchestrate No Transpose Case: nLoops = %d\n", nLoops); #endif eventArray = malloc(nLoops*sizeof(cl_event)); if (eventArray == NULL) { return clblasOutOfHostMemory; } // // Solve 1 Triangle using Triangle Kernel Followed by Rectangle Kernels // trtri->event = &eventArray[0]; if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = (cl_int)((kargs->N) - TARGET_ROWS); trtri->args.endRow = (cl_int)((kargs->N)-1); } else { trtri->args.startRow = 0; trtri->args.endRow = (cl_int)(TARGET_ROWS-1); } err = executeSolutionSeq(trtriSeq); if (err == CL_SUCCESS) { // // Solve the Rectangles one by one // for(i=1; inumEventsInWaitList = 1; gemv->eventWaitList = &eventArray[i-1]; if (i < (nLoops-1)) { gemv->event = &eventArray[i]; } else { gemv->event = events; } if (getUpLo(kargs) == clblasUpper) { gemv->args.startRow = (cl_int)((kargs->N-1) - (i-1)*TARGET_ROWS); gemv->args.endRow = (cl_int)((kargs->N) - (i)*TARGET_ROWS); } else { gemv->args.startRow = (cl_int)((i-1)*TARGET_ROWS); gemv->args.endRow = (cl_int)((kargs->N) - (TARGET_ROWS*i)); } err = executeSolutionSeq(gemvSeq); if (err != CL_SUCCESS) { printf("TRSV: WARNING: GEMV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } } } free(eventArray); return err; } static clblasStatus orchestrateTransposeTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err; SolutionStep *trtri, *gemv; size_t nLoops, i; cl_event *triangleEventArray; cl_event *rectangleEventArray; size_t TRIANGLE_HEIGHT; ListNode *f = listNodeFirst(trtriSeq); trtri = container_of(f, node, SolutionStep); f = listNodeFirst(gemvSeq); gemv = container_of(f, node, SolutionStep); TRIANGLE_HEIGHT = trtri->subdims->y; if ((trtri->subdims->y) != (gemv->subdims->y)) { printf("TRSV: Transpose: WARNING: TRTRI and GEMV dont have identical sub-divisions!!! %lu and %lu\n", trtri->subdims->y, gemv->subdims->y); return clblasNotImplemented; } else { #ifdef DEBUG_TRSV printf("TRSV: Transpose: MESSAGE: TRTRI and GEMV have identical sub-divisions! 
= %lu\n", TRIANGLE_HEIGHT); #endif } trtri->numEventsInWaitList = numEventsInWaitList; trtri->eventWaitList = eventWaitList; if (kargs->N <= TRIANGLE_HEIGHT) { trtri->event = events; trtri->args.startRow = 0; trtri->args.endRow = (cl_int)(kargs->N); err = executeSolutionSeq(trtriSeq); return err; } // // Allocate Event Chain // nLoops = ((kargs->N) / TRIANGLE_HEIGHT); if ((kargs->N % TRIANGLE_HEIGHT)) { nLoops++; } #ifdef DEBUG_TRSV printf("nLoops: %d\n", nLoops); #endif // // Allocate Event Arrays to order the orchestration // triangleEventArray = malloc(nLoops*sizeof(cl_event)); rectangleEventArray = malloc(nLoops*sizeof(cl_event)); if ((triangleEventArray == NULL) || (rectangleEventArray == NULL)) { if (triangleEventArray) { free (triangleEventArray); } if (rectangleEventArray) { free (rectangleEventArray); } return clblasOutOfHostMemory; } // // Solve as chain of TRIANGLE, RECTANGLE kernels ending on a pair-less TRIANGLE // for(i=0; ievent = &triangleEventArray[i]; if (i == (nLoops-1)) { // // TRTRI's last iteration must be tied to the "event" that the API // user will choose to wait on. // trtri->event = events; } if (i != 0) { // // For first iteration, TRTRI waits on what the API user has specified. // Subsequent iterations will wait on the previous iteration's rectangle // counterpart // trtri->numEventsInWaitList =1; trtri->eventWaitList = &rectangleEventArray[i-1]; } if (getUpLo(kargs) == clblasUpper) { trtri->args.startRow = (cl_int)(TRIANGLE_HEIGHT*i); trtri->args.endRow = (cl_int)(TRIANGLE_HEIGHT*(i+1)); if (trtri->args.endRow >= (cl_int)kargs->N) { trtri->args.endRow = (cl_int)kargs->N; } } else { if (kargs->N < TRIANGLE_HEIGHT*(i+1)) { trtri->args.startRow = 0; } else { trtri->args.startRow = (cl_int)((kargs->N) - TRIANGLE_HEIGHT*(i+1)); } trtri->args.endRow = (cl_int)((kargs->N) - TRIANGLE_HEIGHT*(i)); } err = executeSolutionSeq(trtriSeq); if (err != CL_SUCCESS) { printf("TRSV: Transpose: Breaking in the middle of loop due to error status, i=%d\n", (int)i); break; } if (i == (nLoops-1)) { break; } #ifdef DEBUG_TRSV printf("Calling gemv-"); #endif gemv->numEventsInWaitList = 1; gemv->eventWaitList = &triangleEventArray[i]; gemv->event = &rectangleEventArray[i]; gemv->args.startRow = trtri->args.startRow; gemv->args.endRow = trtri->args.endRow; err = executeSolutionSeq(gemvSeq); if (err != CL_SUCCESS) { printf("TRSV: Transpose: WARNING: GEMV LOOP: Breaking after %d iterations !!!\n", (int)i); break; } } free(triangleEventArray); free(rectangleEventArray); return err; } static clblasStatus orchestrateTRSV(CLBlasKargs *kargs, ListHead *trtriSeq, ListHead *gemvSeq, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus err = clblasNotImplemented; if ( ((kargs->order == clblasColumnMajor) && (kargs->transA == clblasNoTrans)) || ((kargs->order == clblasRowMajor) && (kargs->transA != clblasNoTrans)) ) { #ifdef DEBUG_TRSV printf("Orchestrating the NO-Transpose case..\n"); #endif err = orchestrateNonTransposeTRSV(kargs, trtriSeq, gemvSeq, numEventsInWaitList, eventWaitList, events); } else { #ifdef DEBUG_TRSV printf("Orchestrating the Transpose case..\n"); #endif err = orchestrateTransposeTRSV(kargs, trtriSeq, gemvSeq, numEventsInWaitList, eventWaitList, events); } return err; } clblasStatus doTrsv( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem x, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint 
numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { cl_int err = clblasNotImplemented; ListHead seq; CLBlasKargs gemvKargs; ListHead gemvSeq; // cl_context c; clblasStatus retCode = clblasSuccess; if (!clblasInitialized) { return clblasNotInitialized; } /* Validate arguments */ if ((retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) { #ifdef DEBUG_TRSV printf("Invalid mem object..\n"); #endif return retCode; } /* * PENDING: * checkMatrixSizes() does not account for "offa" argument. * Need to pass "offa" when "checkMatrixSizes()" is changed. */ if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) { #ifdef DEBUG_TRSV printf("Invalid Size for A\n"); #endif return retCode; } if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) { #ifdef DEBUG_TRSV printf("Invalid Size for X\n"); #endif return retCode; } #ifdef DEBUG_TRSV printf("DoTrsv being called...\n"); #endif if ((commandQueues == NULL) || (numCommandQueues == 0)) { return clblasInvalidValue; } if ((numEventsInWaitList !=0) && (eventWaitList == NULL)) { return clblasInvalidEventWaitList; } if (commandQueues[0] == NULL) { return clblasInvalidCommandQueue; } numCommandQueues = 1; // NOTE: Hard-coding the number of command queues to 1 kargs->order = order; kargs->uplo = uplo; kargs->transA = trans; kargs->diag = diag; kargs->M = N; // store Original N kargs->N = N; // The field "kargs->N" is the one used by the generator. kargs->K = N; // store original N kargs->A = A; kargs->lda.matrix = lda; kargs->B = x; kargs->ldb.vector = incx; kargs->offBX = offx; kargs->offa = offa; kargs->offA = offa; kargs->offsetM = 0; kargs->offsetN = 0; kargs->scimage[0] = 0; kargs->scimage[1] = 0; memcpy(&gemvKargs, kargs, sizeof(CLBlasKargs)); #ifdef DEBUG_TRSV printf("Calling makeSolutionSeq : TRSV\n"); #endif listInitHead(&seq); listInitHead(&gemvSeq); //err = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues, //0, NULL, NULL, &seq); /* Problem of context getting released on entry seems to be gone on the new driver. 
Uncomment these lines if problem recurs getQueueContext(commandQueues[0], &c); clRetainContext(c); #ifdef DEBUG_TRSV clGetContextInfo(c, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &refcnt, NULL); printf("doTrsv(): REFCNT ON ENTRY= %u\n", refcnt); #endif */ err = makeSolutionSeq(CLBLAS_TRSV, kargs, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events, &seq); if (err == CL_SUCCESS) { err = makeSolutionSeq(CLBLAS_TRSV_GEMV, &gemvKargs, numCommandQueues, commandQueues, 0, NULL, NULL, &gemvSeq); if (err == CL_SUCCESS) { #ifdef DEBUG_TRSV printf("Orchestrating TRSV\n"); #endif err = orchestrateTRSV(kargs, &seq, &gemvSeq, numEventsInWaitList, eventWaitList, events); } } freeSolutionSeq(&seq); freeSolutionSeq(&gemvSeq); #ifdef DEBUG_TRSV if (clGetContextInfo(c, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &refcnt, NULL) != CL_SUCCESS) { printf("doTrsv(): clGetContextInfo failed..\n"); } else { printf("doTrsv(): REFCNT EXIT = %u\n", refcnt); } #endif return err; } clblasStatus clblasStrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("STRSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TRSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("DTRSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TRSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("CTRSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TRSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("ZTRSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TRSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, 
numEventsInWaitList, eventWaitList, events); } clblasStatus clblasStpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("STPSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_FLOAT; kargs.pigFuncID = CLBLAS_TPSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, 0, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasDtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("DTPSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_DOUBLE; kargs.pigFuncID = CLBLAS_TPSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, 0, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasCtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("CTPSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_FLOAT; kargs.pigFuncID = CLBLAS_TPSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, 0, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clblasZtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { CLBlasKargs kargs; #ifdef DEBUG_TRSV printf("ZTPSV Called\n"); #endif memset(&kargs, 0, sizeof(kargs)); kargs.dtype = TYPE_COMPLEX_DOUBLE; kargs.pigFuncID = CLBLAS_TPSV; return doTrsv(&kargs, order, uplo, trans, diag, N, A, offa, 0, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblas-2.10/src/library/clBLAS.pc.in000066400000000000000000000004721264277366700171470ustar00rootroot00000000000000prefix=@CMAKE_INSTALL_PREFIX@ exec_prefix=${prefix}/bin@SUFFIX_BIN@ includedir=${prefix}/include libdir=${prefix}/lib@SUFFIX_LIB@ Name: clBLAS Description: Open source OpenCL BLAS library Version: @clBLAS_VERSION@ URL: https://github.com/clMathLibraries/clBLAS Cflags: -I${includedir} Libs: -L${libdir} -lclBLAS clblas-2.10/src/library/common/000077500000000000000000000000001264277366700164435ustar00rootroot00000000000000clblas-2.10/src/library/common/clkern.c000066400000000000000000000170241264277366700200710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include enum { MAX_SOURCE_SIZE = 1048576, MAX_OPENCL_DEVICES = 64 }; static size_t getBinSizeAndIdx(cl_program program, int *idx) { size_t allSizes[MAX_OPENCL_DEVICES], size = 0; size_t i, retSize; clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(allSizes), &allSizes, &retSize); retSize /= sizeof(size); for (i = 0; i < retSize; i++) { size = allSizes[i]; if (size) { break; } } if (idx && (i < retSize)) { *idx = (int)i; } return size; } cl_int launchClKernel( KernelDesc *kernDesc, cl_command_queue queue, KernelErrorInfo *errInfo) { cl_int status; unsigned int i; KernelArg *karg; KernelErrorInfo ei; unsigned long t; unsigned int nrArgs; errInfo->phase = -1; errInfo->wrongArg = (unsigned int)-1; ei.phase = -1; ei.wrongArg = (unsigned int)-1; status = clGetKernelInfo(kernDesc->kernel, CL_KERNEL_NUM_ARGS, sizeof(nrArgs), &nrArgs, NULL); if (status != CL_SUCCESS) { return status; } karg = kernDesc->args; for (i = 0; (i < nrArgs) && (status == CL_SUCCESS); i++, karg++) { status = clSetKernelArg(kernDesc->kernel, i, karg->typeSize, karg->arg.data); if (status != CL_SUCCESS) { ei.wrongArg = i; ei.phase = PHASE_SET_ARGS; } else if (karg->hostBuf && (karg->dir & MEMOBJ_WRITE)) { status = clEnqueueWriteBuffer(queue, karg->arg.mem, CL_TRUE, 0, karg->hostBufLen, karg->hostBuf, 0, NULL, NULL); if (status != CL_SUCCESS) { ei.wrongArg = i; ei.phase = PHASE_ENQUEUE_WRITE; } } } if (status == CL_SUCCESS) { status = clEnqueueNDRangeKernel(queue, kernDesc->kernel, (cl_uint)kernDesc->workDim, NULL, (const size_t*)kernDesc->globalThreads, (const size_t*)kernDesc->localThreads, (cl_uint)kernDesc->waitListSize, kernDesc->eventWaitList, kernDesc->event); if ((status == CL_SUCCESS) && !kernDesc->nowait) { status = clWaitForEvents(1, kernDesc->event); } if (status != CL_SUCCESS) { ei.phase = PHASE_ENQUEUE_KERNEL; } if ((status == CL_SUCCESS) && kernDesc->needExecTime && kernDesc->event) { if (kernDesc->nowait) { status = clWaitForEvents(1, kernDesc->event); if (status != CL_SUCCESS) { ei.phase = PHASE_PROFILING; } } if (status == CL_SUCCESS) { status = clGetEventProfilingInfo(*kernDesc->event, CL_PROFILING_COMMAND_START, sizeof(t), &t, NULL); if (status == CL_SUCCESS) { status = clGetEventProfilingInfo(*kernDesc->event, CL_PROFILING_COMMAND_END, sizeof(kernDesc->execTime), &kernDesc->execTime, NULL); kernDesc->execTime -= t; } if (status != CL_SUCCESS) { ei.phase = PHASE_PROFILING; } } } } karg = kernDesc->args; for (i = 0; (i < nrArgs) && (status == CL_SUCCESS); i++, karg++) { if (karg->hostBuf && (karg->dir & MEMOBJ_READ)) { status = clEnqueueReadBuffer(queue, karg->arg.mem, CL_TRUE, 0, karg->hostBufLen, karg->hostBuf, 0, NULL, NULL); if (status != CL_SUCCESS) { ei.wrongArg = i; ei.phase = PHASE_ENQUEUE_READ; } } } if ((status != CL_SUCCESS) && errInfo) { errInfo->phase = ei.phase; if (ei.phase != PHASE_ENQUEUE_KERNEL) { errInfo->wrongArg = ei.wrongArg; } } return status; } cl_program buildClProgram( const char *source, const char *buildOpts, cl_context ctx, cl_device_id devID, char *logBuf, size_t 
logBufSize, cl_int *status) { cl_program program = NULL; cl_int stat = CL_SUCCESS; program = clCreateProgramWithSource(ctx, 1, (const char**)&source, NULL, &stat); if (program != NULL) { stat = clBuildProgram(program, 1, (const cl_device_id*)&devID, buildOpts, NULL, NULL); if (stat != CL_SUCCESS) { if (logBuf) { logBuf[0] = '\0'; clGetProgramBuildInfo(program, devID, CL_PROGRAM_BUILD_LOG, logBufSize, logBuf, NULL); } clReleaseProgram(program); program = NULL; } } if (status) { *status = stat; } return program; } cl_program createClProgramWithBinary( cl_context ctx, cl_device_id devID, unsigned char *binary, size_t binSize, cl_int *status) { cl_program program; cl_int s; program = clCreateProgramWithBinary(ctx, 1, &devID, &binSize, (const unsigned char**)&binary, NULL, &s); if (program != NULL) { s = clBuildProgram(program, 1, &devID, NULL, NULL, NULL); if (s != CL_SUCCESS) { clReleaseProgram(program); program = NULL; } } if (status != NULL) { *status = s; } return program; } size_t getProgramBinarySize(cl_program program) { return getBinSizeAndIdx(program, NULL); } unsigned char *getProgramBinary(cl_program program) { unsigned char *binaries[MAX_OPENCL_DEVICES]; unsigned char *bin = NULL; size_t size; int idx = 0; memset(binaries, 0, sizeof(binaries)); size = getBinSizeAndIdx(program, &idx); bin = binaries[idx] = malloc(size); if (bin != NULL) { cl_int err; err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(binaries), binaries, NULL); if (err != CL_SUCCESS) { free(bin); bin = NULL; } } return bin; } clblas-2.10/src/library/common/devinfo-cache.c000066400000000000000000000706031264277366700213100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include static cl_ulong closestPowerOf2(cl_ulong x); static const char L2BENCH_NAME[] = "l2Bench"; static const char *L2BENCH = "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n" " CLK_ADDRESS_NONE | \n" " CLK_FILTER_NEAREST; \n" "__kernel \n" "void l2Bench( \n" " __read_only image2d_t in, \n" " size_t rounds, \n" " __global float4 *out) \n" "{ \n" " int width, height; \n" " size_t gid, nrWorkItems; \n" " size_t pixelsPerWorkItem; \n" " size_t x, y, k, i; \n" " float4 v, sum; \n" " width = get_image_width(in); \n" " height = get_image_height(in); \n" " gid = get_global_id(0); \n" " nrWorkItems = get_global_size(0); \n" " pixelsPerWorkItem = (width * height) / nrWorkItems; \n" " sum = (float4)(0.0); \n" " for (k = 0; k < rounds; k++) { \n" " x = (gid * pixelsPerWorkItem) % width; \n" " y = (gid * pixelsPerWorkItem) / width; \n" " for (i = 0; i < pixelsPerWorkItem; i++) { \n" " v = read_imagef(in, sampler, (int2)(x, y)); \n" " sum += v; \n" " x++; \n" " y += x / width; \n" " x %= width; \n" " } \n" " } \n" " *out = sum; \n" "} \n"; static const char L1BENCH_NAME[] = "l1Bench"; static const char *L1BENCH = "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | \n" " CLK_ADDRESS_NONE | \n" " CLK_FILTER_NEAREST; \n" "__kernel \n" "void l1Bench( \n" " __read_only image2d_t in, \n" " size_t l2Size, \n" " size_t rounds, \n" " __global float4 *out) \n" "{ \n" " int width, height; \n" " size_t gid, nrWorkItems; \n" " size_t pixelsPerWorkItem; \n" " size_t x, y, k, i; \n" " float4 v, sum; \n" " width = get_image_width(in); \n" " height = get_image_height(in); \n" " gid = get_global_id(0); \n" " nrWorkItems = get_global_size(0); \n" " pixelsPerWorkItem = (width * height) / nrWorkItems; \n" " sum = (float4)(0.0); \n" " for (k = 0; k < rounds; k++) { \n" " x = (gid * pixelsPerWorkItem) % width; \n" " y = (gid * pixelsPerWorkItem) / width; \n" " for (i = 0; i < pixelsPerWorkItem - l2Size / sizeof(float4); i++) { \n" " v = read_imagef(in, sampler, (int2)(x, y)); \n" " sum += v; \n" " x++; \n" " y += x / width; \n" " x %= width; \n" " } \n" " } \n" " *out = sum; \n" "} \n"; cl_ulong deviceL2CacheSize( cl_device_id device, cl_int *error) { const size_t MAX_CACHE_SIZE = 1024 * 1024; const size_t MIN_CACHE_SIZE = 1 * 1024; const size_t STEP = 4 * 1024; /* Bigger number of rounds increases time measurement precision, * but slows the test down. */ const unsigned int ROUNDS = 32; /* Repeat each kernel run sereval times for higher reliability. */ const unsigned int RELIABILITY_ROUNDS = 5; cl_int err; cl_uint maxComputeUnits; cl_bool imageSupport; cl_platform_id platform; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_event event; cl_float *in; size_t width, height; const cl_image_format format = { CL_RGBA, CL_FLOAT }; cl_mem imgIn; size_t origin[3], region[3]; cl_float4 out; cl_mem bufOut; size_t global_work_size, local_work_size; cl_ulong start, end, avg; cl_long *times; cl_double d, max; size_t steps; size_t i, t; /* Collect device properties. 
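 * Only two pieces of information are actually needed here: the number of
 * compute units, which is later used to launch exactly one work-item per
 * unit, and whether the device supports images, since the benchmark streams
 * its data through a 2D image. On devices without image support the probe
 * simply reports 0 and sets CL_INVALID_OPERATION, the same error
 * clCreateImage2D() would return.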
*/ err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &maxComputeUnits, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return 0; } err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return 0; } if (imageSupport == CL_FALSE) { if (error != NULL) { *error = CL_INVALID_OPERATION; /* like clCreateImage2D() does */ } return 0; } steps = (MAX_CACHE_SIZE - MIN_CACHE_SIZE) / STEP; times = calloc(steps, sizeof(cl_long)); if (times == NULL) { if (error != NULL) { *error = CL_OUT_OF_HOST_MEMORY; } return 0; } /* Create necessary OpenCL objects */ err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL); if (err != CL_SUCCESS) { free(times); if (error != NULL) { *error = err; } return 0; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { free(times); if (error != NULL) { *error = err; } return 0; } queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } program = clCreateProgramWithSource(ctx, 1, &L2BENCH, NULL, &err); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } kernel = clCreateKernel(program, L2BENCH_NAME, &err); clReleaseProgram(program); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } /* Main idea of this test is to run one work-item on each compute unit. * This will make clear L2 cache hit/miss picture. 
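 * The image, which the work-items collectively re-read ROUNDS times, is
 * shrunk from MAX_CACHE_SIZE towards MIN_CACHE_SIZE in STEP-sized
 * decrements, and each run's duration is normalized per pixel into times[].
 * Once the whole image fits into L2 the reads stop missing and the
 * per-pixel time drops, so the size at which the ratio of neighbouring
 * entries, times[t - 1] / times[t], peaks is taken as the cache capacity
 * and rounded to the closest power of two.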
*/ global_work_size = maxComputeUnits; local_work_size = 1; /* Prepare output buffer */ bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float4), &out, &err); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } for (t = 0; t < steps; t++) { width = (size_t)sqrt((double)(MAX_CACHE_SIZE - t * STEP) / sizeof(cl_float4)); height = width; /* Prepare image buffer */ in = calloc(width * height, sizeof(cl_float4)); if (in == NULL) { clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = CL_OUT_OF_HOST_MEMORY; } return 0; } imgIn = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &format, width, height, 0, in, &err); if (err != CL_SUCCESS) { free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } origin[0] = origin[1] = origin[2] = 0; region[0] = width; region[1] = height; region[2] = 1; avg = 0; for (i = 0; i < RELIABILITY_ROUNDS; i++) { err = clEnqueueWriteImage(queue, imgIn, CL_TRUE, origin, region, 0, 0, in, 0, NULL, NULL); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clEnqueueWriteBuffer(queue, bufOut, CL_TRUE, 0, sizeof(cl_float4), &out, 0, NULL, NULL); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imgIn); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 1, sizeof(ROUNDS), &ROUNDS); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufOut); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clWaitForEvents(1, &event); if (err != CL_SUCCESS) { clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } start = end = 0UL; err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); if (err != CL_SUCCESS) { clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); if (err != 
CL_SUCCESS) { clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } clReleaseEvent(event); /* NOTE: Sometimes the difference between start and end times * can be unexpectedly large - a tens of seconds. * This is a wrong behavior. */ //assert(end - start < 10000000000UL); avg += end - start; } times[t] = avg / (width * height); clReleaseMemObject(imgIn); free(in); } clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); max = 0; i = MAX_CACHE_SIZE + 1; for (t = 1; t < steps; t++) { d = (cl_double)times[t - 1]; d /= times[t]; if (d > max) { max = d; i = MAX_CACHE_SIZE - t * STEP; } } free(times); if (i == MAX_CACHE_SIZE + 1) return 0; return closestPowerOf2(i); } cl_ulong deviceL1CacheSize( cl_device_id device, cl_ulong l2CacheSize, cl_int *error) { const size_t MIN_CACHE_SIZE = 1024; const size_t STEP = 1024; size_t L2_SIZE = (size_t)l2CacheSize; /* Bigger number of rounds increases time measurement precision, * but slows the test down. */ const unsigned int ROUNDS = 64; /* Repeat each kernel run sereval times for higher reliability. */ const unsigned int RELIABILITY_ROUNDS = 10; cl_int err; cl_uint maxComputeUnits; cl_bool imageSupport; cl_platform_id platform; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_event event; cl_float *in; size_t width, height; const cl_image_format format = { CL_RGBA, CL_FLOAT }; cl_mem imgIn; size_t origin[3], region[3]; cl_float4 out; cl_mem bufOut; size_t global_work_size, local_work_size; cl_ulong start, end, avg; cl_long *times; cl_double d, max; size_t steps; size_t i, t; /* Collect device properties. 
*/ err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &maxComputeUnits, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return 0; } err = clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return 0; } if (imageSupport == CL_FALSE) { if (error != NULL) { *error = CL_INVALID_OPERATION; /* like clCreateImage2D() does */ } return 0; } steps = 1 + (L2_SIZE - MIN_CACHE_SIZE) / STEP; times = calloc(steps, sizeof(cl_long)); if (times == NULL) { if (error != NULL) { *error = CL_OUT_OF_HOST_MEMORY; } return 0; } /* Create necessary OpenCL objects */ err = clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(cl_platform_id), &platform, NULL); if (err != CL_SUCCESS) { free(times); if (error != NULL) { *error = err; } return 0; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { free(times); if (error != NULL) { *error = err; } return 0; } queue = clCreateCommandQueue(ctx, device, CL_QUEUE_PROFILING_ENABLE, &err); if (err != CL_SUCCESS) { clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } program = clCreateProgramWithSource(ctx, 1, &L1BENCH, NULL, &err); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (err != CL_SUCCESS) { clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } kernel = clCreateKernel(program, L1BENCH_NAME, &err); clReleaseProgram(program); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } /* Main idea of this test is to run one work-item on each compute unit. * Image region assigned to one work-item consists of two parts: * - part with size of probable L1 cache * - part with size of L2 cache * This makes cache misses in L1 to be misses in L2 as well. * It is also assumed, that each Compute Unit has its own L1 cache. 
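 * As in the L2 probe, the candidate L1 size is swept downwards from the L2
 * size in STEP-sized decrements, the measured time is normalized by the
 * size of the candidate region (per work-item, over all compute units), and
 * the candidate at which the times[t - 1] / times[t] ratio between
 * neighbouring steps peaks is reported, rounded to the closest power of two.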
*/ global_work_size = maxComputeUnits; local_work_size = 1; /* Prepare output buffer */ bufOut = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_float4), &out, &err); if (err != CL_SUCCESS) { clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } for (t = 0; t < steps; t++) { width = 64; /* One image line takes 1KB */ height = (L2_SIZE - t * STEP + L2_SIZE) * global_work_size / (sizeof(cl_float4) * width); /* Prepare image buffer */ in = calloc(width * height, sizeof(cl_float4)); if (in == NULL) { clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = CL_OUT_OF_HOST_MEMORY; } return 0; } imgIn = clCreateImage2D(ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &format, width, height, 0, in, &err); if (err != CL_SUCCESS) { free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } origin[0] = origin[1] = origin[2] = 0; region[0] = width; region[1] = height; region[2] = 1; avg = 0; for (i = 0; i < RELIABILITY_ROUNDS; i++) { err = clEnqueueWriteImage(queue, imgIn, CL_TRUE, origin, region, 0, 0, in, 0, NULL, NULL); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clEnqueueWriteBuffer(queue, bufOut, CL_TRUE, 0, sizeof(cl_float4), &out, 0, NULL, NULL); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imgIn); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 1, sizeof(L2_SIZE), &L2_SIZE); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 2, sizeof(ROUNDS), &ROUNDS); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufOut); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, &event); if (err != CL_SUCCESS) { clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clWaitForEvents(1, &event); if (err != CL_SUCCESS) { clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } start = end = 0UL; err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); if (err != CL_SUCCESS) { 
clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); if (err != CL_SUCCESS) { clReleaseEvent(event); clReleaseMemObject(imgIn); free(in); clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); free(times); if (error != NULL) { *error = err; } return 0; } clReleaseEvent(event); /* NOTE: Sometimes the difference between start and end times * can be unexpectedly large - a tens of seconds. * This is a wrong behavior. */ //assert(end - start < 10000000000UL); avg += end - start; } times[t] = avg / ((L2_SIZE - t * STEP) * global_work_size); clReleaseMemObject(imgIn); free(in); } clReleaseMemObject(bufOut); clReleaseCommandQueue(queue); clReleaseContext(ctx); max = 0; i = L2_SIZE + 1; for (t = 1; t < steps; t++) { d = (cl_double)times[t - 1]; d /= times[t]; if (d > max) { max = d; i = L2_SIZE - t * STEP; } } free(times); if (i == L2_SIZE + 1) return 0; return closestPowerOf2(i); } cl_uint deviceL1CacheAssoc( cl_device_id device, cl_ulong l1CacheSize, cl_int *error) { /* TODO: Implementation needed. */ (void)device; (void)l1CacheSize; if (error != NULL) { *error = CL_SUCCESS; } return 32; } static cl_ulong closestPowerOf2(cl_ulong x) { cl_ulong below, above; if (x == 0) { return 0; } for (above = 1; above < x; above <<= 1) { ; /* just iterate */ } if (above == x) { return x; } below = above >> 1; if ((x - below) < (above - x)) { return below; } return above; } clblas-2.10/src/library/common/devinfo.c000066400000000000000000000144271264277366700202510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #if defined(__APPLE__) || defined(__MACOSX) #include #else #include #endif #include #include #include #include static DeviceVendor stringToVendor(const char *str) { DeviceVendor vendor; if (!strcmp(str, "Advanced Micro Devices, Inc.")) { vendor = VENDOR_AMD; } else if (!strcmp(str, "NVIDIA Corporation")) { vendor = VENDOR_NVIDIA; } else { vendor = VENDOR_UNKNOWN; } return vendor; } static DeviceChip stringToChip(const char *str) { DeviceChip chip; if (!strcmp(str, "Redwood")) { chip = REDWOOD; } else if (!strcmp(str, "Juniper")) { chip = JUNIPER; } else if (!strcmp(str, "Cypress")) { chip = CYPRESS; } else if (!strcmp(str, "Hemlock")) { chip = HEMLOCK; } else if (!strcmp(str, "Cayman")) { chip = CAYMAN; } else if (!strcmp(str, "Tahiti")) { chip = TAHITI; } else if (!strcmp(str, "Hawaii")) { chip = HAWAII; } else if (!strcmp(str, "Bonaire")) { chip = BONAIRE; } else if (!strcmp(str, "GeForce GTX 480")) { chip = GEFORCE_GTX_480; } else if (!strcmp(str, "GeForce GTX 580")) { chip = GEFORCE_GTX_580; } else { chip = CHIP_UNKNOWN; } return chip; } static DeviceFamily devFamily(DeviceChip chip) { DeviceFamily fam; switch (chip) { case REDWOOD: case JUNIPER: case CYPRESS: case HEMLOCK: fam = GPU_FAMILY_EVERGREEN; break; case GEFORCE_GTX_480: case GEFORCE_GTX_580: fam = GPU_FAMILY_FERMI; break; default: fam = DEVICE_FAMILY_UNKNOWN; break; } return fam; } cl_int identifyDevice(TargetDevice *target) { cl_int err; char s[4096]; DeviceIdent *ident = &target->ident; err = clGetDeviceInfo(target->id, CL_DEVICE_VENDOR, sizeof(s), s, NULL); if (err != CL_SUCCESS) { return err; } ident->vendor = stringToVendor(s); err = clGetDeviceInfo(target->id, CL_DEVICE_NAME, sizeof(s), s, NULL); if (err != CL_SUCCESS) { return err; } ident->chip = stringToChip(s); ident->family = devFamily(ident->chip); return CL_SUCCESS; } cl_uint deviceWavefront( cl_device_id device, cl_int *error) { (void)device; if (error != NULL) { *error = CL_SUCCESS; } return 64; } bool deviceHasNativeComplex( cl_device_id device, cl_int *error) { (void)device; if (error != NULL) { *error = CL_SUCCESS; } return false; } cl_uint deviceComputeUnits( cl_device_id device, cl_int *error) { cl_int err; cl_uint v; v = 0; err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(v), &v, NULL); if (error != NULL) { *error = err; } return v; } size_t deviceMaxWorkgroupSize( cl_device_id device, cl_int *error) { cl_int err; size_t v; v = 64; err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(v), &v, NULL); if (error != NULL) { *error = err; } return v; } cl_ulong deviceLDSSize( cl_device_id device, cl_int *error) { cl_int err; cl_long v; v = 0; err = clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(v), &v, NULL); if (error != NULL) { *error = err; } return v; } cl_uint deviceDataAlignment( cl_device_id device, cl_int *error) { cl_int err; cl_uint v; v = 0; err = clGetDeviceInfo(device, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(v), &v, NULL); if (error != NULL) { *error = err; } return v; } cl_uint deviceAddressBits( cl_device_id device, cl_int *error) { cl_int err; cl_uint v; v = 0; err = clGetDeviceInfo(device, CL_DEVICE_ADDRESS_BITS, sizeof(v), &v, NULL); if (error != NULL) { *error = err; } return v; } bool deviceHasNativeDouble( cl_device_id device, cl_int *error) { cl_int err; cl_uint v; size_t len; char *extensions, *s; /* Check for cl_khr_fp64 extension */ err = clGetDeviceInfo(device, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, 
sizeof(cl_uint), &v, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return false; } if (v != 0) { if (error != NULL) { *error = CL_SUCCESS; } return true; } /* Check extensions */ err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &len); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return false; } extensions = calloc(1, len); if (extensions == NULL) { if (error != NULL) { *error = CL_OUT_OF_HOST_MEMORY; } return false; } err = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, len, extensions, NULL); if (err != CL_SUCCESS) { free(extensions); if (error != NULL) { *error = err; } return false; } /* Check for cl_amd_fp64 extension */ s = strstr(extensions, "cl_amd_fp64"); /* strlen("cl_amd_fp64") = 11 */ if (s != NULL) { if ((s[11] == ' ') || (s[11] == '\0')) { free(extensions); if (error != NULL) { *error = err; } return true; } } free(extensions); if (error != NULL) { *error = CL_SUCCESS; } return false; } clblas-2.10/src/library/common/gens/000077500000000000000000000000001264277366700173775ustar00rootroot00000000000000clblas-2.10/src/library/common/gens/dblock_kgen.c000066400000000000000000001252501264277366700220120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* * TODO: * 1) barriers in the case when work group size is greater than the * wavefront size * 2) 2D dimensional work group size * 3) Try version with array indexing * 4) Option to avoid unaligned access to vector data (?) */ // work performed by work items typedef struct ItemWork { // number of rows to be processed by single work item size_t nrRows; // number of columns to be processed by single work item size_t nrCols; // number of items processing the same row unsigned int itemsPerRow; // total number of items performing the work unsigned int nrItems; // reduced number of rows at the block tail size_t blockTail; // work size to be done with the row tail non packed in float4 size_t tail; } ItemWork; /* * Private data for loop unrolling * * NOTE: lmemLD is not used if both * 'locLDName' is initialized */ typedef struct GenPriv { DBlockCopyDirection dir; bool transp; bool packed; bool conjugate; bool notVectorize; // local memory block leading dimension size_t lmemLD; // local memory leading dimension variable name const char *locLDName; // global memory leading dimension variable name const char *globLDName; DataType dtype; unsigned int nfloats; unsigned int typeSize; const SubproblemDim *dim; const ItemWork *work; const char *srcName; const char *dstName; // variables names used while copying to images const char *imgXName; const char *imgYName; size_t cnt; // The block size used for copying. // The default is 4. 
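    // For copies between the global and the local memory initGenPriv()
    // derives this value from the amount of data per work item, roughly
    // dim->x * dim->y * nfloats / workGroupSize, and clamps the result to
    // the 1..4 range; for the other copy directions the float4 default
    // is kept.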
unsigned int vecLen; } GenPriv; /* * 'ld' in the list of arguments is matrix leading dimension * * Common name forming rule: * (type prefix)(generic part)['Transp']['Conj']['Nvec'](src mem][dst mem][block height][block width] */ const char *copyMemDBlockDecl = "void\n" "%ccopyDBlock%s%s%s%c%c%lu%lu(\n" " %cPtr dst,\n" " %cPtr src,\n" " uint startRow,\n" " uint startCol,\n" " uint ld)\n"; const char *copyMemGImgDBlockDecl = "void\n" "%ccopyDBlock%sGI%lux%lu(\n" " __write_only image2d_t dst,\n" " int startX,\n" " int startY,\n" " GPtr src,\n" " uint startRow,\n" " uint startCol,\n" " uint ld)\n"; const char *copyMemLImgDBlockDecl = "void\n" "%ccopyDBlock%sLI%lux%lu(\n" " __write_only image2d_t dst,\n" " int startX,\n" " int startY,\n" " LPtr src)\n"; /* * declaration for function performing slow data block copying */ const char *copyMemDBlockSlowDecl = "void\n" "%ccopyDBlock%s%s%s%c%c(\n" " %cPtr dst,\n" " %cPtr src,\n" " uint startRow,\n" " uint startCol,\n" " uint nrRows,\n" " uint nrCols,\n" " uint dstLD,\n" " uint srcLD)\n"; /* * declaration for function performing slow data to image block copying */ const char *copyMemGImgDBlockSlowDecl = "void\n" "%ccopyDBlock%sGI(\n" " __write_only image2d_t dst,\n" " int startX,\n" " int startY,\n" " GPtr src,\n" " uint startRow,\n" " uint startCol,\n" " uint nrRows,\n" " uint nrCols,\n" " uint srcLD)\n"; const char *copyMemLImgDBlockSlowDecl = "void\n" "%ccopyDBlock%sLI(\n" " __write_only image2d_t dst,\n" " int startX,\n" " int startY,\n" " LPtr src,\n" " uint nrRows,\n" " uint nrCols,\n" " uint srcLD)\n"; /* * local variables for slow copying between the global and * the local memory */ const char *copyMemSlowLvars = "uint i, j, n;\n" /* * end counters for copying with vector blocks and just vectors * depending in copying type and direction */ "uint jb, jv;\n" // end counter for coying with single data with size lesser than float4 "%s" // temporaty float4 variable for the transposing version "%s" "%cPtr dst1;\n" "%cPtr src1;\n\n"; /* * One version use passing over the rows, and the second one use * passing over the columns. The Second variant is used for transposed * copying from the local to the global memory. */ const char *copyMemDBlockSlowStart[2] = { "if (nrRows %% lsize) {\n" " n = nrRows / lsize + 1;\n" "}\n" "else {\n" " n = nrRows / lsize;\n" "}\n" "\n" "jb = nrCols / %u;\n" "jv = (nrCols - jb * %u) / %u;\n" // set counter end for copying with data which size is lesser than float4 "%s" // set pointers to initial position "%s" "%s" "n = (n * lid >= nrRows) ? 0 : n;\n" "n = (n * lid + n > nrRows) ? (n - 1) : n;\n" "\n", "if (nrCols %% lsize) {\n" " n = nrCols / lsize + 1;\n" "}\n" "else {\n" " n = nrCols / lsize;\n" "}\n" "\n" // set counters for vector copying "jb = nrRows / %u;\n" "jv = (nrRows - jb * %u) / %u;\n" // set counter end for copying with data which size is lesser than float4 "%s" // set pointers to initial position "%s" "%s" "n = (n * lid >= nrCols) ? 0 : n;\n" "n = (n * lid + n > nrCols) ? 
(n - 1) : n;\n" "\n" }; /* * declaration for function zeroing float4 aligned * block of data */ const char *f4zeroDecl = "void\n" "%cf4zero%lu(%s float4 *data)\n"; const char *fzeroSlowDecl = "void\n" "%cf4zero(%s float4 *buf, size_t cnt)\n"; const char *copyMemImgDBlockSlow = "for (i = 0; i < n; i++) {\n" " int x1 = x;\n" " int y1 = y;\n" " %cPtr src1 = src;\n" "\n" " for (j = 0; j < jb; j++) {\n" " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n" " }\n" " for (j = 0; j < jv; j++) {\n" " write_imageui(dst, (int2)(x1++, y1), as_uint4(*src1.f4v++));\n" " }\n" "\n" " y++;\n" " src.%s += srcLD;\n" "}\n"; const char *copyMemImgDBlockPackedSlow = "for (i = 0; i < n; i++) {\n" " %cPtr src1 = src;\n" " x = startX + ((index + i) %% nLines) * nrCols / %lu;\n" " y = startY + (index + i) / nLines;\n" "\n" " for (j = 0; j < jb; j++) {\n" " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n" " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n" " }\n" " for (j = 0; j < jv; j++) {\n" " write_imageui(dst, (int2)(x++, y), as_uint4(*src1.f4v++));\n" " }\n" "\n" " src.%s += srcLD;\n" "}\n"; const char *setLoopBoundStmt = "if (lid > %u) {\n" " nrows = 0;\n" "}\n" "else {\n" " nrows = (lid == %u) ? %u : %u;\n" "}\n"; const char *privatePtrs = "%cPtr src1;\n" "%cPtr dst1;\n"; // loop bound variable name const char *lboundVarName = "nrows"; // local id variable const char *lidVarName = "lid"; /* * Partial initialization of the generator private information */ static void initGenPriv( GenPriv *priv, DataType dtype, unsigned int typeSize, const SubproblemDim *dim, DBlockCopyDirection dir, const ItemWork *work, const PGranularity *pgran) { unsigned int gsize; priv->dtype = dtype; priv->typeSize = typeSize; priv->nfloats = typeSize / sizeof(float); priv->dim = dim; priv->dir = dir; priv->work = work; priv->cnt = 0; priv->vecLen = FLOAT4_VECLEN; if (dir == DBLOCK_GLOBAL_TO_LOCAL || dir == DBLOCK_LOCAL_TO_GLOBAL) { gsize = pgran->wgSize[0] * pgran->wgSize[1]; priv->vecLen = (unsigned int)(dim->x * dim->y * priv->nfloats / gsize); if (priv->vecLen < 1) { priv->vecLen = 1; } else if (priv->vecLen > 4) { priv->vecLen = FLOAT4_VECLEN; } } } /* * get info about work to be done by the work group * * Resulting work data chunk for each item is float4 aligned. * Remaining data chunk presented as tail for which code is * generated just after the loop part getting deal with float4 * aligned chunks. 
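 * As an illustration (hypothetical numbers): for a 64-item work group and a
 * block of 16 rows by 64 columns of floats with vecLen = 4, dim->y (16) is
 * smaller than the group size, so four items share each row
 * (itemsPerRow = 4), every item copies one run of 64 / 4 = 16 consecutive
 * floats (nrRows = 1, nrCols = 16), the column count is already a multiple
 * of the vector length, and the tail is empty.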
*/ static void getItemWork(ItemWork *work, const SubproblemDim *dim, const PGranularity *pgran, size_t nfloats, unsigned int vecLen) { size_t n; size_t gsize; memset(work, 0, sizeof(ItemWork)); gsize = pgran->wgSize[0] * pgran->wgSize[1]; if (dim->y < gsize) { // one work item processes a part of a row (or none at all) work->itemsPerRow = (unsigned int)(gsize / dim->y); work->nrCols = dim->x / work->itemsPerRow; work->nrRows = 1; if (work->itemsPerRow * dim->y < gsize) { work->nrItems = (unsigned int)(work->itemsPerRow * dim->y); } } else { // one work item processes typically several rows (or none at all) work->itemsPerRow = 1; work->nrCols = dim->x; work->nrRows = dim->y / gsize; if (dim->y % gsize) { work->nrRows++; work->nrItems = (unsigned int)(dim->y / work->nrRows); // remaining number of rows n = dim->y - work->nrItems * work->nrRows; if (n) { work->blockTail = n; // total number of work items needed for the transfer work->nrItems++; } } } work->nrCols -= (work->nrCols * nfloats % vecLen) / nfloats; work->tail = dim->x - work->nrCols * work->itemsPerRow; } /* * Prepare generator outer loop */ static void prepareLoop(struct KgenContext *ctx, ItemWork *work, LoopCtl *loopCtl) { char tmp[1024]; kgenAddStmt(ctx, "size_t n;\n"); loopCtl->ocName = "n"; if (work->nrItems) { sprintf(tmp, "size_t %s;\n\n", lboundVarName); kgenAddStmt(ctx, tmp); /* * set number of rows to be processed by the work item; * in the case it is not a constant */ if (work->blockTail) { sprintf(tmp, setLoopBoundStmt, work->nrItems - 1, work->nrItems - 1, work->blockTail, work->nrRows); kgenAddStmt(ctx, tmp); } else { sprintf(tmp, "nrows = (%s >= %u) ? 0 : %lu;\n", lidVarName, work->nrItems, work->nrRows); kgenAddStmt(ctx, tmp); } loopCtl->outBound.name = lboundVarName; } else { loopCtl->outBound.val = (unsigned long)work->nrRows; loopCtl->obConst = true; } } static int getVecLen(struct KgenContext *ctx, void *priv) { GenPriv *gpriv = (GenPriv*)priv; (void) ctx; return gpriv->vecLen; } /* * common function for loop tail generating */ static void addTailCode( struct KgenContext *ctx, GenPriv *gpriv, LoopUnrollGen genSingleVec, LoopUnrollGen genSingle) { char tmp[1024]; const ItemWork *work = gpriv->work; LoopCtl loopCtl; LoopUnrollers unrollers; memset(&loopCtl, 0, sizeof(loopCtl)); memset(&unrollers, 0, sizeof(unrollers)); loopCtl.inBound = (unsigned long)work->tail; if (work->itemsPerRow > 1) { if (work->nrItems) { sprintf(tmp, "if ((%s %% %u == %u) && (%s < %u))", lidVarName, work->itemsPerRow, work->itemsPerRow - 1, lidVarName, work->nrItems); } else { sprintf(tmp, "if (%s %% %u == %u)", lidVarName, work->itemsPerRow, work->itemsPerRow - 1); } kgenBeginBranch(ctx, tmp); } unrollers.genSingleVec = genSingleVec; unrollers.genSingle = genSingle; unrollers.getVecLen = getVecLen; kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv); if (work->itemsPerRow > 1) { kgenEndBranch(ctx, NULL); } } static int copyMemPreUnroll(struct KgenContext *ctx, void *priv) { DUMMY_ARG_USAGE(priv); kgenAddStmt(ctx, "src1 = src;\n"); return kgenAddStmt(ctx, "dst1 = dst;\n\n"); } static int copyImgPreUnroll(struct KgenContext *ctx, void *priv) { char tmp[1024]; GenPriv *gpriv = (GenPriv*)priv; if (gpriv->packed) { sprintf(tmp, "%s = startX + (index * %lu) %% pLine / %u;\n" "%s = startY + (index * %lu) / pLine;\n" "%s = src;\n\n", gpriv->imgXName, gpriv->dim->x, FLOAT4_VECLEN / gpriv->nfloats, gpriv->imgYName, gpriv->dim->x, gpriv->srcName); } else { sprintf(tmp, "%s = x;\n" "%s = y;\n" "%s = src;\n\n", gpriv->imgXName, 
gpriv->imgYName, gpriv->srcName); } return kgenAddStmt(ctx, tmp); } static int copyImgVec(struct KgenContext *ctx, void *priv) { char tmp[1024]; GenPriv *gpriv = (GenPriv*)priv; dtypeUPtrField(gpriv->dtype); sprintf(tmp, "write_imageui(%s, (int2)(%s++,%s), as_uint4(*%s.f4v++));\n", gpriv->dstName, gpriv->imgXName, gpriv->imgYName, gpriv->srcName); return kgenAddStmt(ctx, tmp); } static int copyImgSingle(struct KgenContext *ctx, void *priv) { GenPriv *gpriv = (GenPriv*)priv; if (gpriv->dtype == TYPE_COMPLEX_DOUBLE) { return copyImgVec(ctx, priv); } else { return -EINVAL; } } static int copyMemVec(struct KgenContext *ctx, void *priv) { char tmp[1024]; char vec[64]; GenPriv *gpriv = (GenPriv*)priv; if (gpriv->vecLen == 1) sprintf(vec,"f"); else sprintf(vec,"f%dv", gpriv->vecLen); if (gpriv->conjugate) { sprintf(tmp, "tmp = *%s.%s++;\n", gpriv->srcName, vec); kgenAddStmt(ctx, tmp); if (gpriv->dtype == TYPE_COMPLEX_FLOAT) { kgenAddStmt(ctx, "tmp.y = -tmp.y;\n" "tmp.w = -tmp.w;\n"); } else { kgenAddStmt(ctx, "tmp.y = -tmp.y;\n"); } sprintf(tmp, "*%s.%s++ = tmp;\n", gpriv->dstName, vec); } else { sprintf(tmp, "*%s.%s++ = *%s.%s++;\n", gpriv->dstName, vec, gpriv->srcName, vec); } return kgenAddStmt(ctx, tmp); } static int copyMemSingle(struct KgenContext *ctx, void *priv) { char tmp[1024]; GenPriv *gpriv = (GenPriv*)priv; const char *vfield; vfield = dtypeUPtrField(gpriv->dtype); if (gpriv->conjugate) { sprintf(tmp, "*%s.%s = *%s.%s++;\n", gpriv->dstName, vfield, gpriv->srcName, vfield); kgenAddStmt(ctx, tmp); sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n", gpriv->dstName, vfield, gpriv->dstName, vfield); kgenAddStmt(ctx, tmp); sprintf(tmp, "%s.%s++;\n", gpriv->dstName, vfield); } else { sprintf(tmp, "*%s.%s++ = *%s.%s++;\n", gpriv->dstName, vfield, gpriv->srcName, vfield); } return kgenAddStmt(ctx, tmp); } static int copyMemVecTransp(struct KgenContext *ctx, void *priv) { char tmp[1024]; size_t i; GenPriv *gpriv = (GenPriv*)priv; unsigned int n = gpriv->nfloats; const char *tmpSuff[2][4] = { {"x", "y", "z", "w"}, {"xy", "zw", NULL, NULL}}; const char *dstSuff[4] = {"f", "f2v", NULL, "f4v"}; const char *vfield; const char *s; vfield = dtypeUPtrField(gpriv->dtype); kgenAddBlankLine(ctx); if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) { sprintf(tmp, "tmp = *%s.f4v++;\n", gpriv->srcName); kgenAddStmt(ctx, tmp); if (gpriv->conjugate) { /* * Only complex float element can be conjugated here, * those of double complex type are processed with no vectrized * function */ kgenAddStmt(ctx, "tmp.y = -tmp.y;\n" "tmp.w = -tmp.w;\n"); } for (i = 0; i < FLOAT4_VECLEN / n; i++) { if (gpriv->locLDName) { sprintf(tmp, "%s.%s[%s * %lu] = tmp.%s;\n", gpriv->dstName, dstSuff[n - 1], gpriv->locLDName, i, tmpSuff[n - 1][i]); } else { sprintf(tmp, "%s.%s[%lu] = tmp.%s;\n", gpriv->dstName, dstSuff[n - 1], gpriv->lmemLD * i, tmpSuff[n - 1][i]); } kgenAddStmt(ctx, tmp); } s = gpriv->dstName; } else { for (i = 0; i < FLOAT4_VECLEN / n; i++) { if (gpriv->locLDName) { sprintf(tmp, "tmp.%s = %s.%s[%s * %lu];\n", tmpSuff[n - 1][i], gpriv->srcName, dstSuff[n - 1], gpriv->locLDName, i); } else { sprintf(tmp, "tmp.%s = %s.%s[%lu];\n", tmpSuff[n - 1][i], gpriv->srcName, dstSuff[n - 1], gpriv->lmemLD * i); } kgenAddStmt(ctx, tmp); } sprintf(tmp, "*%s.f4v++ = tmp;\n", gpriv->dstName); kgenAddStmt(ctx, tmp); s = gpriv->srcName; } if (gpriv->locLDName) { sprintf(tmp, "%s.%s += %s * %lu;\n", s, vfield, gpriv->locLDName, i); } else { sprintf(tmp, "%s.%s += %lu;\n", s, vfield, gpriv->lmemLD * i); } return kgenAddStmt(ctx, tmp); } static int 
copyMemSingleTransp(struct KgenContext *ctx, void *priv) { char tmp[1024]; GenPriv *gpriv = (GenPriv*)priv; const char *vfield; vfield = dtypeUPtrField(gpriv->dtype); kgenAddBlankLine(ctx); if (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) { if (gpriv->locLDName) { sprintf(tmp, "*%s.%s = *%s.%s++;\n", gpriv->dstName, vfield, gpriv->srcName, vfield); kgenAddStmt(ctx, tmp); if (gpriv->conjugate) { sprintf(tmp, "(*%s.%s).y = -(*%s.%s).y;\n", gpriv->dstName, vfield, gpriv->dstName, vfield); kgenAddStmt(ctx, tmp); } sprintf(tmp, "%s.%s += %s;\n", gpriv->dstName, vfield, gpriv->locLDName); } else { sprintf(tmp, "%s.%s[%lu] = *%s.%s++;\n", gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt, gpriv->srcName, vfield); if (gpriv->conjugate) { kgenAddStmt(ctx, tmp); sprintf(tmp, "%s.%s[%lu].y = -%s.%s[%lu].y;\n", gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt, gpriv->dstName, vfield, gpriv->lmemLD * gpriv->cnt); } } } else { if (gpriv->locLDName) { sprintf(tmp, "*%s.%s++ = *%s.%s;\n" "%s.%s += %s;\n", gpriv->dstName, vfield, gpriv->srcName, vfield, gpriv->srcName, vfield, gpriv->locLDName); } else { sprintf(tmp, "*%s.%s++ = %s.%s[%lu];\n", gpriv->dstName, vfield, gpriv->srcName, vfield, gpriv->lmemLD * gpriv->cnt); } } gpriv->cnt++; return kgenAddStmt(ctx, tmp); } /* * transfer row tail elements being not packing in float4 vector * and zeroing row tail */ static void addCopyTailCode(struct KgenContext *ctx, GenPriv *gpriv) { LoopUnrollGen singleVec; LoopUnrollGen single; bool image; image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE || gpriv->dir == DBLOCK_LOCAL_TO_IMAGE); if (image) { singleVec = copyImgVec; single = copyImgSingle; } else { if (gpriv->transp) { singleVec = copyMemVecTransp; single = copyMemSingleTransp; } else { singleVec = copyMemVec; single = copyMemSingle; } } if (gpriv->notVectorize) { singleVec = NULL; } addTailCode(ctx, gpriv, singleVec, single); } static int copyMemPostUnroll(struct KgenContext *ctx, void *priv) { char tmp[1024]; const char *s[2] = {"src", "dst"}; GenPriv *gpriv = (GenPriv*)priv; int gdir; const char *vfield; gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 
0 : 1; if (gpriv->work && gpriv->work->tail) { addCopyTailCode(ctx, gpriv); } if (!gpriv->transp) { kgenAddBlankLine(ctx); } // modify pointers vfield = dtypeUPtrField(gpriv->dtype); sprintf(tmp, "%s.%s += %s;\n", s[gdir], vfield, gpriv->globLDName); kgenAddStmt(ctx, tmp); if (gpriv->transp) { sprintf(tmp, "%s.%s++;\n", s[1 - gdir], vfield); } else { if (gpriv->locLDName) { sprintf(tmp, "%s.%s += %s;\n", s[1 - gdir], vfield, gpriv->locLDName); } else { sprintf(tmp, "%s.%s += %lu;\n", s[1 - gdir], vfield, gpriv->lmemLD); } } return kgenAddStmt(ctx, tmp); } static int copyImgPostUnroll(struct KgenContext *ctx, void *priv) { char tmp[1024]; GenPriv *gpriv = (GenPriv*)priv; const char *vfield = dtypeUPtrField(gpriv->dtype); if (gpriv->work && gpriv->work->tail) { addCopyTailCode(ctx, gpriv); } kgenAddBlankLine(ctx); if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) { sprintf(tmp, "src.%s += %s;\n", vfield, gpriv->globLDName); } else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) { sprintf(tmp, "src.%s += %lu;\n", vfield, gpriv->lmemLD); } kgenAddStmt(ctx, tmp); if(gpriv->packed) { sprintf(tmp, "index++;\n"); } else { sprintf(tmp, "y++;\n"); } return kgenAddStmt(ctx, tmp); } // unrolling generator for the f4zero function static int f4zeroSingle(struct KgenContext *ctx, void *priv) { DUMMY_ARG_USAGE(priv); return kgenAddStmt(ctx, "*data++ = 0;\n"); } /* * Add statement setting initial local pointer for the work item * * @ld: lead dimension for the local block in float words; * if it's zero, the "ld" argument of a generated function is * used instead */ static void addSettingPtrCode( struct KgenContext *ctx, const char *ptrName, size_t ld, bool transpose, const PGranularity *pgran, GenPriv *gpriv) { char tmp[4096]; const char *vfield; const SubproblemDim *dim = gpriv->dim; const ItemWork *work = gpriv->work; size_t gsize; vfield = dtypeUPtrField(gpriv->dtype); gsize = pgran->wgSize[0] * pgran->wgSize[1]; if (ld) { // offset between two rows and two elements in each row size_t roff, eoff; if (transpose) { roff = 1; eoff = ld; } else { roff = ld; eoff = 1; } if (dim->y < gsize) { sprintf(tmp, "%s.%s += (%s / %u) * %lu + (%s %% %u * %lu) * %lu;\n", ptrName, vfield, lidVarName, work->itemsPerRow, roff, lidVarName, work->itemsPerRow, work->nrCols, eoff); } else { sprintf(tmp, "%s.%s += %s * %lu * %lu;\n", ptrName, vfield, lidVarName, work->nrRows, roff); } } else { if (dim->y < gsize) { sprintf(tmp, "%s.%s += (startRow + %s / %u) * %s + " "startCol + %s %% %u * %lu;\n", ptrName, vfield, lidVarName, work->itemsPerRow, gpriv->globLDName, lidVarName, work->itemsPerRow, work->nrCols); } else { sprintf(tmp, "%s.%s += (startRow + %s * %lu) * %s + startCol;\n", ptrName, vfield, lidVarName, work->nrRows, gpriv->globLDName); } } kgenAddStmt(ctx, tmp); kgenAddBlankLine(ctx); } /* * Add statement setting initial coordinates pointer for image * */ static void addSettingImageXYCode( struct KgenContext *ctx, const char *xName, const char *yName, const PGranularity *pgran, GenPriv *gpriv) { char tmp[4096]; const ItemWork *work = gpriv->work; size_t gsize = pgran->wgSize[0] * pgran->wgSize[1]; if (gpriv->packed) { sprintf(tmp, "pLine = ((get_image_width(dst) - startX) * %d / %lu) * %lu;\n", FLOAT4_VECLEN / gpriv->nfloats, gpriv->dim->x, gpriv->lmemLD); kgenAddStmt(ctx, tmp); if (gpriv->dim->y < gsize) { sprintf(tmp, "index = %s / %u;\n", lidVarName, work->itemsPerRow); } else { sprintf(tmp, "index = %s * %lu;\n", lidVarName, work->nrRows); } kgenAddStmt(ctx, tmp); sprintf(tmp, "x = startX + (index * %lu) %% pLine / %u;\n", 
gpriv->dim->x, FLOAT4_VECLEN / gpriv->nfloats); kgenAddStmt(ctx, tmp); if (gpriv->dim->y < gsize) { sprintf(tmp, "x += (%s %% %u) * (%lu / %u / %u);\n", lidVarName, work->itemsPerRow, gpriv->dim->x, (FLOAT4_VECLEN / gpriv->nfloats), work->itemsPerRow); kgenAddStmt(ctx, tmp); } sprintf(tmp, "y = startY + (index * %lu) / pLine;\n", gpriv->dim->x); kgenAddStmt(ctx, tmp); } else { if (gpriv->dim->y < gsize) { sprintf(tmp, "%s = startX + %s %% %u * %lu / %d;\n", xName, lidVarName, work->itemsPerRow, work->nrCols, FLOAT4_VECLEN/gpriv->nfloats); kgenAddStmt(ctx, tmp); sprintf(tmp, "%s = startY + %s / %u;\n", yName, lidVarName, work->itemsPerRow); kgenAddStmt(ctx, tmp); } else { sprintf(tmp, "%s = startX;\n", xName); kgenAddStmt(ctx, tmp); sprintf(tmp, "%s = startY + %s * %lu;\n", yName, lidVarName, gpriv->work->nrRows); kgenAddStmt(ctx, tmp); } } kgenAddBlankLine(ctx); } // generator working with subproblems of any dimension static int copyDBlockGenericGen( struct KgenContext *ctx, const PGranularity *pgran, GenPriv *gpriv) { char fpref; const char varPref[2] = {'G', 'L'}; char tmp[1024]; bool image; const char *s[3]; int gdir; unsigned int i, n, gsize; const char *vfield; DataType dtype = gpriv->dtype; fpref = dtypeToPrefix(dtype); if (!fpref || (fpref == 'i')) { return -EINVAL; } image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE || gpriv->dir == DBLOCK_LOCAL_TO_IMAGE); s[0] = (gpriv->transp) ? "Transp" : ""; vfield = dtypeUPtrField(dtype); n = FLOAT4_VECLEN / gpriv->nfloats; gsize = pgran->wgSize[0] * pgran->wgSize[1]; if (image) { char srcStr[1024]; s[1] = (gpriv->packed) ? "Pack" : ""; if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) { sprintf(srcStr, "src.%s += (startRow + lid * n) *" " srcLD + startCol;\n", vfield); sprintf(tmp, copyMemGImgDBlockSlowDecl, fpref, s[1]); } else { sprintf(srcStr, "src.%s += srcLD * lid * n;\n", vfield); sprintf(tmp, copyMemLImgDBlockSlowDecl, fpref, s[1]); } kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); sprintf(tmp, "int x, y;\n" "uint i, j, n, jb, jv;\n" "int lsize = %u;\n", gsize); kgenAddStmt(ctx, tmp); kgenDeclareLocalID(ctx, "lid", pgran); if (gpriv->packed) { char nLinesStr[1024]; sprintf(nLinesStr, "nLines = (get_image_width(dst) - startX) * %d / nrCols;\n" "index = lid * n;\n", FLOAT4_VECLEN / gpriv->nfloats); sprintf(tmp, "int nLines, index;\n"); kgenAddStmt(ctx, tmp); sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n,"", nLinesStr, srcStr); } else { sprintf(tmp, copyMemDBlockSlowStart[0], 4 * n, 4 * n, n, "", "x = startX;\n" "y = startY + lid * n;\n", srcStr); } kgenAddStmt(ctx, tmp); gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) ? 0 : 1; if (gpriv->packed) { sprintf(tmp, copyMemImgDBlockPackedSlow, varPref[gdir], FLOAT4_VECLEN / gpriv->nfloats, vfield); } else { sprintf(tmp, copyMemImgDBlockSlow, varPref[gdir], vfield); } kgenAddStmt(ctx, tmp); } else { LoopCtl loopCtl; LoopUnrollers unrollers; char buf[3][256]; memset(&loopCtl, 0, sizeof(loopCtl)); memset(&unrollers, 0, sizeof(unrollers)); s[1] = (gpriv->conjugate) ? "Conj" : ""; s[2] = (gpriv->notVectorize) ? "Nvec" : ""; gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1; sprintf(tmp, copyMemDBlockSlowDecl, fpref, s[0], s[1], s[2], varPref[gdir], varPref[1 - gdir], varPref[1 - gdir], varPref[gdir]); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); kgenDeclareLocalID(ctx, "lid", pgran); sprintf(tmp, "int lsize = %u;\n", gsize); kgenAddStmt(ctx, tmp); if (dtype == TYPE_COMPLEX_DOUBLE) { s[0] = ""; s[1] = ""; } else { s[0] = "uint js;\n"; s[1] = (gpriv->transp || gpriv->conjugate) ? 
"float4 tmp;\n" : ""; } // pass over rows or columns? i = (gpriv->transp && gdir) ? 1 : 0; if (dtype == TYPE_COMPLEX_DOUBLE) { buf[0][0] = '\0'; } else { const char *boundName; // set counter bound to copy tail part, each work less than float4 boundName = (i) ? "nrRows" : "nrCols"; /* * FIXME: the kludge is introduced due to strange * runtime segfault at block transferring for another * data types. Verify it later. Now, for non float types * keep only simple loop. */ if (i && (dtype != TYPE_FLOAT)) { gpriv->notVectorize = true; } if (gpriv->notVectorize) { sprintf(buf[0], "jb = 0;\n" "jv = 0;\n" "js = %s;\n", boundName); } else { sprintf(buf[0], "js = %s - jb * %u - jv * %u;\n", boundName, 4 * n, n); } } // set initial pointers if (!gdir) { sprintf(buf[1], "src.%s += (startRow + lid * n) * srcLD + " "startCol;\n", vfield); if (gpriv->transp) { sprintf(buf[2], "dst.%s += lid * n;\n", vfield); } else { sprintf(buf[2], "dst.%s += dstLD * lid * n;\n", vfield); } } else { if (gpriv->transp) { sprintf(buf[1], "src.%s += lid * n;\n", vfield); } else { sprintf(buf[1], "src.%s += srcLD * lid * n;\n", vfield); } sprintf(buf[2], "dst.%s += (startRow + lid * n) * dstLD + " "startCol;\n", vfield); } sprintf(tmp, copyMemSlowLvars, s[0], s[1], varPref[1 - gdir], varPref[gdir]); kgenAddStmt(ctx, tmp); sprintf(tmp, copyMemDBlockSlowStart[i], 4 * n, 4 * n, n, buf[0], buf[1], buf[2]); kgenAddStmt(ctx, tmp); // prepare to loop unrolling gpriv->srcName = "src1"; gpriv->dstName = "dst1"; if (gdir) { gpriv->locLDName = "srcLD"; gpriv->globLDName = "dstLD"; } else { gpriv->locLDName = "dstLD"; gpriv->globLDName = "srcLD"; } loopCtl.ocName = "j"; if (gpriv->transp) { unrollers.genSingle = copyMemSingleTransp; if (dtype != TYPE_COMPLEX_DOUBLE) { unrollers.genSingleVec = copyMemVecTransp; } } else { unrollers.genSingle = copyMemSingle; if (dtype != TYPE_COMPLEX_DOUBLE) { unrollers.genSingleVec = copyMemVec; } } // external loop kgenBeginBranch(ctx, "for (i = 0; i < n; i++)"); copyMemPreUnroll(ctx, gpriv); // finally, unroll all loops unrollers.getVecLen = getVecLen; // copying with 4 float4 words if (!gpriv->notVectorize) { loopCtl.outBound.name = "jb"; loopCtl.inBound = 4 * n; kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv); // copying with float4 words loopCtl.outBound.name = "jv"; loopCtl.inBound = n; kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv); } // copying the remaining tail if (dtype != TYPE_COMPLEX_DOUBLE) { unrollers.genSingleVec = NULL; loopCtl.outBound.name = "js"; loopCtl.inBound = 1; kgenLoopUnroll(ctx, &loopCtl, dtype, &unrollers, gpriv); } copyMemPostUnroll(ctx, gpriv); kgenEndBranch(ctx, NULL); } return kgenEndFuncBody(ctx); } // generator optimizing to a subproblem size static int copyDBlockOptimGen( struct KgenContext *ctx, const SubproblemDim *dim, const PGranularity *pgran, GenPriv *gpriv) { char fpref; const char varPref[2] = {'G', 'L'}; char tmp[1024]; // lead dimension for right and transposed local block in float words ItemWork work; LoopCtl loopCtl; LoopUnrollers unrollers; const char *s, *s1, *s2; bool image; SubproblemDim newDim; // copying direction within the memory or image related function group int gdir = 0; int r; fpref = dtypeToPrefix(gpriv->dtype); if (!fpref || (fpref == 'i')) { return -EINVAL; } image = (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE || gpriv->dir == DBLOCK_LOCAL_TO_IMAGE); memset(&unrollers, 0, sizeof(unrollers)); memset(&loopCtl, 0, sizeof(loopCtl)); memset(&newDim, 0, sizeof(newDim)); gpriv->dim = &newDim; gpriv->work = (const ItemWork*)&work; 
gpriv->globLDName = "ld"; s = (gpriv->transp) ? "Transp" : ""; s1 = (gpriv->conjugate) ? "Conj" : ""; s2 = (gpriv->notVectorize) ? "Nvec" : ""; if ((gpriv->dir == DBLOCK_LOCAL_TO_GLOBAL) && gpriv->transp) { // pass over columns of the block stored in the local memory newDim.x = dim->y; newDim.y = dim->x; } else { // pass over rows newDim.x = dim->x; newDim.y = dim->y; } getItemWork(&work, &newDim, pgran, gpriv->nfloats, gpriv->vecLen); if (image) { s = (gpriv->packed) ? "Pack" : ""; if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) { sprintf(tmp, copyMemGImgDBlockDecl, fpref, s, dim->y, dim->x); } else { sprintf(tmp, copyMemLImgDBlockDecl, fpref, s, dim->y, dim->x); } } else { gdir = (gpriv->dir == DBLOCK_GLOBAL_TO_LOCAL) ? 0 : 1; sprintf(tmp, copyMemDBlockDecl, fpref, s, s1, s2, varPref[gdir], varPref[1 - gdir], dim->y, dim->x, varPref[1 - gdir], varPref[gdir]); } kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); kgenDeclareLocalID(ctx, lidVarName, pgran); if (image) { // data for loop unrolling if (work.nrRows > 1) { gpriv->srcName = "src1"; gpriv->dstName = "dst"; gpriv->imgXName="x1"; gpriv->imgYName="y1"; if(gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) { kgenAddStmt(ctx, "GPtr src1;\n"); } else if(gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) { kgenAddStmt(ctx, "LPtr src1;\n"); } kgenAddStmt(ctx, "int x1, y1;\n"); unrollers.preUnroll = copyImgPreUnroll; unrollers.postUnroll = copyImgPostUnroll; } else { gpriv->srcName = "src"; // dst has image2d_t type here gpriv->dstName = "dst"; gpriv->imgXName="x"; gpriv->imgYName="y"; } } else { if ((gpriv->nfloats != FLOAT4_VECLEN) && (gpriv->transp || gpriv->conjugate)) { /* * temporary variable to transpose or conjugate non double * complex elements */ kgenAddStmt(ctx, "float4 tmp;\n"); } if (work.nrRows > 1) { sprintf(tmp, privatePtrs, varPref[gdir], varPref[1 - gdir]); kgenAddStmt(ctx, tmp); // data for loop unrolling unrollers.preUnroll = copyMemPreUnroll; unrollers.postUnroll = copyMemPostUnroll; gpriv->srcName = "src1"; gpriv->dstName = "dst1"; } else { gpriv->srcName = "src"; gpriv->dstName = "dst"; } } if ((work.nrRows > 1) || work.nrItems) { prepareLoop(ctx, &work, &loopCtl); } kgenAddBlankLine(ctx); loopCtl.inBound = (unsigned long)work.nrCols; // now, prepare all needed for loop unrolling if (image) { kgenAddStmt(ctx, "int x, y;\n"); if (gpriv->packed) { kgenAddStmt(ctx, "int pLine, index;\n"); } gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) * FLOAT4_VECLEN / gpriv->nfloats; // set up starting x and y in image addSettingImageXYCode(ctx, "x", "y", pgran, gpriv); if (gpriv->dir == DBLOCK_GLOBAL_TO_IMAGE) { // set initial global pointer addSettingPtrCode(ctx, "src", 0, false, pgran, gpriv); } else if (gpriv->dir == DBLOCK_LOCAL_TO_IMAGE) { // set initial local pointer addSettingPtrCode(ctx, "src", gpriv->lmemLD, gpriv->transp, pgran, gpriv); } unrollers.genSingleVec = copyImgVec; unrollers.genSingle = copyImgSingle; } else { // set initial global pointer s = (gdir) ? "dst" : "src"; addSettingPtrCode(ctx, s, 0, false, pgran, gpriv); s = (gdir) ? "src" : "dst"; if (!gdir && gpriv->transp) { gpriv->lmemLD = fl4RowWidth(dim->y, gpriv->typeSize) * FLOAT4_VECLEN / gpriv->nfloats; } else { gpriv->lmemLD = fl4RowWidth(dim->x, gpriv->typeSize) * FLOAT4_VECLEN / gpriv->nfloats; } if (gpriv->transp) { unrollers.genSingleVec = (gpriv->notVectorize) ? NULL : copyMemVecTransp; unrollers.genSingle = copyMemSingleTransp; } else { unrollers.genSingleVec = (gpriv->notVectorize) ? 
NULL : copyMemVec; unrollers.genSingle = copyMemSingle; } addSettingPtrCode(ctx, s, gpriv->lmemLD, gpriv->transp, pgran, gpriv); } unrollers.getVecLen = getVecLen; // unroll for float4 aligned data chunk kgenLoopUnroll(ctx, &loopCtl, gpriv->dtype, &unrollers, gpriv); /* * Unroll for remaining data tail. * Block tail reading/writing is done separately * when many work items process single row * because the compiler don't like any conditional * branches in loops */ if ((unrollers.postUnroll == NULL) && work.tail) { addCopyTailCode(ctx, gpriv); } r = kgenEndFuncBody(ctx); return r ? -EOVERFLOW : 0; } int copyDataBlockGen( struct KgenContext *ctx, const SubproblemDim *dim, const PGranularity *pgran, DataType dtype, DBlockCopyDirection dir, DBlockCopyFlags flags) { int r; GenPriv gpriv; unsigned int tsize; tsize = dtypeSize(dtype); if (dir == DBLOCK_LOCAL_TO_IMAGE || dir == DBLOCK_GLOBAL_TO_IMAGE) { size_t rowSize; if (dim != NULL) { rowSize = tsize * dim->x; if (rowSize % sizeof(cl_float4) != 0) { // only float4 aligned rows are supported return -EINVAL; } } if (flags & DBLOCK_COPY_TRANSPOSE) { return -EINVAL; } } memset(&gpriv, 0, sizeof(gpriv)); gpriv.transp = (flags & DBLOCK_COPY_TRANSPOSE); gpriv.packed = (flags & DBLOCK_COPY_PACKED_IMAGE); if (dtype != TYPE_COMPLEX_DOUBLE) { gpriv.notVectorize = (flags & DBLOCK_COPY_NOT_VECTORIZE); } if ((flags & DBLOCK_COPY_CONJUGATE) && isComplexType(dtype)) { gpriv.conjugate = true; } initGenPriv(&gpriv, dtype, tsize, dim ,dir, NULL, pgran); if (dim) { r = copyDBlockOptimGen(ctx, dim, pgran, &gpriv); } else { r = copyDBlockGenericGen(ctx, pgran, &gpriv); } return r; } int f4zeroBlockGen( struct KgenContext *ctx, const SubproblemDim *dim, const PGranularity *pgran, const char *memPrefix) { char tmp[1024]; ItemWork work; LoopCtl loopCtl; GenPriv priv; char pref; LoopUnrollers unrollers; if (!strcmp(memPrefix, "__local")) { pref = 'l'; } else if (!strcmp(memPrefix, "__global")) { pref = 'g'; } else { return -EINVAL; } if (dim->y != 1) { return -EINVAL; } memset(&loopCtl, 0, sizeof(loopCtl)); memset(&unrollers, 0, sizeof(unrollers)); memset(&priv, 0, sizeof(GenPriv)); initGenPriv(&priv, TYPE_COMPLEX_DOUBLE, FLOAT4_VECLEN * sizeof(cl_float), dim, 0, (const ItemWork*)&work, pgran); getItemWork(&work, dim, pgran, priv.nfloats, priv.vecLen); sprintf(tmp, f4zeroDecl, pref, dim->x, memPrefix); kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); // declare local ID variable and set data offset kgenDeclareLocalID(ctx, lidVarName, pgran); sprintf(tmp, "\ndata += %s * %lu;\n\n", lidVarName, work.nrCols); kgenAddStmt(ctx, tmp); unrollers.genSingle = f4zeroSingle; loopCtl.inBound = (unsigned int)work.nrCols; unrollers.getVecLen = getVecLen; kgenLoopUnroll(ctx, &loopCtl, TYPE_COMPLEX_DOUBLE, &unrollers, &priv); if (work.tail) { addTailCode(ctx, &priv, NULL, f4zeroSingle); } return kgenEndFuncBody(ctx); } clblas-2.10/src/library/common/kern_cache.c000066400000000000000000000244631264277366700207020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Kernel cache implementation */ /* * TODO: more efficient data structure to search * by dimensions (red black tree, for example) (?) */ #include #include #include #include #include #include #define KCACHE_LOCK(kache) mutexLock((kcache)->mutex) #define KCACHE_UNLOCK(kcache) mutexUnlock((kcache)->mutex) #define UNLIMITED_CACHE_SIZE (~0UL) enum { KNODE_MAGIC = 0x3CED50C5, TRUNC_AHEAD_FACTOR = 4, MAX_OPENCL_DEVICES = 64 }; // prime is chosen such overflowing on multiply on is very likely const unsigned long long prime = 100000000000000889LL; typedef struct KernelNode { unsigned long magic; unsigned long refcnt; Kernel kern; unsigned long hash; // key data the kernel is based on KernelKey key; // function comparing kernel extra information KernelExtraCmpFn extraCmp; // node to store in a memory pattern related list ListNode dimNode; ListNode lruNode; } KernelNode; typedef struct KcacheKey { unsigned long hash; KernelKey key; const void *extra; } KcacheKey; struct KernelCache { size_t totalSize; size_t sizeLimit; // total amount of solvers unsigned int nrSolvers; // lists to search by subproblem dimensions ListHead *dimKern; // least recently used kernels list ListHead lruKern; mutex_t *mutex; }; // update kernel hash using the dimension size static __inline unsigned long updateHash(unsigned long hash, unsigned long size) { if (size != SUBDIM_UNUSED) { hash = (hash << 5) | size; } return hash; } // hash kernel subproblem dimensions static unsigned long kernHash(const SubproblemDim *subdims, unsigned int nrDims) { unsigned int i; unsigned long hash = 0; for (i = 0; i < nrDims; i++) { hash = updateHash(hash, (unsigned long)subdims[i].x); hash = updateHash(hash, (unsigned long)subdims[i].y); hash = updateHash(hash, (unsigned long)subdims[i].bwidth); hash = updateHash(hash, (unsigned long)subdims[i].itemX); hash = updateHash(hash, (unsigned long)subdims[i].itemY); } return (unsigned long)(hash * prime); } // comparison function to look for a kernel node in the cache static int knodeCmp(const ListNode *node, const void *key) { KcacheKey *kkey = (KcacheKey*)key; KernelNode *knode = container_of(node, dimNode, KernelNode); KernelKey *a = &(kkey->key); KernelKey *b = &(knode->key); if ((a->device != b->device) || (a->context != b->context) || (a->nrDims != b->nrDims)) { return 1; } if (memcmp(a->subdims, b->subdims, a->nrDims * sizeof(SubproblemDim)) != 0) { return 1; } if (knode->extraCmp != NULL) { return knode->extraCmp(knode->kern.extra, kkey->extra); } return 0; } static void removeKernels(ListHead *truncList, struct KernelCache *kcache, size_t truncSize) { size_t remSize = 0; size_t ksize; ListNode *l; KernelNode *knode; listInitHead(truncList); while (remSize < truncSize) { l = listNodeLast(&kcache->lruKern); if (l == &kcache->lruKern) { break; } knode = container_of(l, lruNode, KernelNode); listDel(l); listDel(&knode->dimNode); listAddToTail(truncList, &knode->lruNode); ksize = fullKernelSize(&knode->kern); remSize += ksize; kcache->totalSize -= ksize; } } static void putRemovedKernels(struct KernelCache *kcache, 
ListHead *truncList) { struct ListNode *l; struct KernelNode *knode; while (1) { l = listNodeFirst(truncList); if (l == truncList) { break; } knode = container_of(l, lruNode, KernelNode); listDel(l); putKernel(kcache, &knode->kern); } } Kernel *allocKernel(void) { KernelNode *knode; knode = malloc(sizeof(KernelNode)); if (knode == NULL) { return NULL; } memset(knode, 0, sizeof(KernelNode)); knode->refcnt = 1; knode->magic = KNODE_MAGIC; return &knode->kern; } void getKernel(Kernel *kern) { KernelNode *knode; knode = container_of(kern, kern, KernelNode); assert(knode->magic == KNODE_MAGIC); knode->refcnt++; } void putKernel(struct KernelCache *kcache, Kernel *kern) { KernelNode *knode; unsigned long refcnt; if (kern == NULL) { return; } knode = container_of(kern, kern, KernelNode); assert(knode->magic == KNODE_MAGIC); if (kcache) { KCACHE_LOCK(kcache); } refcnt = --knode->refcnt; if (kcache) { KCACHE_UNLOCK(kcache); } if (!refcnt) { if (kern->dtor) { kern->dtor(kern); } clReleaseProgram(kern->program); clReleaseContext(knode->key.context); free(knode); } } struct KernelCache *createKernelCache( unsigned int nrSolvers, size_t sizeLimit) { int err = 0; unsigned int i; struct KernelCache *kcache; kcache = malloc(sizeof(struct KernelCache)); if (kcache == NULL) { return NULL; } memset(kcache, 0, sizeof(struct KernelCache)); kcache->nrSolvers = nrSolvers; kcache->dimKern = malloc(kcache->nrSolvers * sizeof(ListHead)); if (kcache->dimKern == NULL) { err = -1; } else { for (i = 0; i < kcache->nrSolvers; i++) { listInitHead(&kcache->dimKern[i]); } listInitHead(&kcache->lruKern); kcache->sizeLimit = sizeLimit; kcache->totalSize = 0; kcache->mutex = mutexInit(); err = (kcache->mutex == NULL); } if (err) { if (kcache->dimKern) { free(kcache->dimKern); } free(kcache); kcache = NULL; } return kcache; } void destroyKernelCache(struct KernelCache *kcache) { cleanKernelCache(kcache); free(kcache->dimKern); mutexDestroy(kcache->mutex); free(kcache); } int addKernelToCache( struct KernelCache *kcache, solver_id_t sid, Kernel *kern, const KernelKey *key, KernelExtraCmpFn extraCmp) { size_t ksize; KernelNode *knode; ListHead truncList; knode = container_of(kern, kern, KernelNode); assert(knode->magic == KNODE_MAGIC); if ((unsigned)sid >= kcache->nrSolvers || key->nrDims > MAX_SUBDIMS) { return -1; } listInitHead(&truncList); ksize = fullKernelSize(kern); KCACHE_LOCK(kcache); if (kcache->sizeLimit) { if (ksize > kcache->sizeLimit) { KCACHE_UNLOCK(kcache); return -1; } else if (ksize > kcache->sizeLimit - kcache->totalSize) { removeKernels(&truncList, kcache, ksize * TRUNC_AHEAD_FACTOR); } } knode->hash = kernHash(key->subdims, key->nrDims); knode->extraCmp = extraCmp; knode->key.device = key->device; knode->key.context = key->context; clRetainContext(knode->key.context); knode->key.nrDims = key->nrDims; memset(knode->key.subdims, 0, sizeof(knode->key.subdims)); memcpy(knode->key.subdims, key->subdims, sizeof(SubproblemDim) * knode->key.nrDims); listAddToTail(&kcache->dimKern[sid], &knode->dimNode); listAddToHead(&kcache->lruKern, &knode->lruNode); kcache->totalSize += ksize; KCACHE_UNLOCK(kcache); if (!isListEmpty(&truncList)) { putRemovedKernels(kcache, &truncList); } return 0; } Kernel *findKernel( struct KernelCache *kcache, solver_id_t sid, const KernelKey *key, const void *extraKey) { Kernel *kern = NULL; KcacheKey kkey; KernelNode *knode; ListNode *lnode; if ((unsigned)sid >= kcache->nrSolvers || key->nrDims > MAX_SUBDIMS) { return NULL; } kkey.hash = kernHash(key->subdims, key->nrDims); kkey.extra = 
extraKey; kkey.key.device = key->device; kkey.key.context = key->context; kkey.key.nrDims = key->nrDims; memset(kkey.key.subdims, 0, sizeof(kkey.key.subdims)); memcpy(kkey.key.subdims, key->subdims, sizeof(SubproblemDim) * kkey.key.nrDims); KCACHE_LOCK(kcache); lnode = listNodeSearch(&kcache->dimKern[sid], &kkey, knodeCmp); if (lnode) { knode = container_of(lnode, dimNode, KernelNode); knode->refcnt++; kern = &knode->kern; // move the kernel to the top of the LRU list listDel(&knode->lruNode); listAddToHead(&kcache->lruKern, &knode->lruNode); } KCACHE_UNLOCK(kcache); return kern; } size_t availKernelCacheSize(struct KernelCache *kcache) { size_t size; KCACHE_LOCK(kcache); size = (kcache->sizeLimit) ? (kcache->sizeLimit - kcache->totalSize) : ~(size_t)0; KCACHE_UNLOCK(kcache); return size; } void cleanKernelCache(struct KernelCache *kcache) { ListHead truncList; KCACHE_LOCK(kcache); removeKernels(&truncList, kcache, kcache->totalSize); KCACHE_UNLOCK(kcache); putRemovedKernels(kcache, &truncList); } size_t fullKernelSize(Kernel *kern) { size_t allSizes[MAX_OPENCL_DEVICES], size = 0; size_t i, retSize; clGetProgramInfo(kern->program, CL_PROGRAM_BINARY_SIZES, sizeof(allSizes), &allSizes, &retSize); retSize /= sizeof(size); for (i = 0; i < retSize; i++) { size += allSizes[i]; } if (!kern->noSource) { clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize); } return (size + retSize + sizeof(Kernel) + kern->extraSize); } #if defined(TRACE_MALLOC) #include void printKernelCacheSize(struct KernelCache *kcache) { printf("[KERNEL CACHE] My size is %lu MiB\n", kcache->totalSize / 1048576); } #endif clblas-2.10/src/library/common/kerngen_core.c000066400000000000000000000304771264277366700212630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Implementation of common logic for kernel * generators */ #include #include #include #include #include #include #include #include /* * TODO: Add checks for corruption for KgenContext and StatementBatch */ enum { TAB_WIDTH = 4, }; struct KgenContext { char *buf; size_t bufLen; // name of the last declared function char *lastFname; size_t fnameLen; // current length without trailing '\0' size_t currLen; bool err; // current execution branch nesting int nesting; // number of tabs on the zero level of nesting int nrTabs; bool fmt; }; struct StmtNode { char *stmt; ListNode node; }; struct StatementBatch { ListHead statements[MAX_STATEMENT_PRIORITY + 1]; }; #ifdef TRACE_MALLOC #define strdup(s) strdupDebug(s) static char *strdupDebug(const char *s) { char *dst; int len; len = strlen(s); dst = malloc(len + 1); if (dst != NULL) { memcpy(dst, s, len); dst[len] = '\0'; } return dst; } #else /* TRACE_MALLOC */ #if defined(_MSC_VER) #define strdup _strdup #endif /* _MSC_VER */ #endif /* !TRACE_MALLOC */ static void resetCtx(struct KgenContext *ctx) { ctx->currLen = 0; ctx->nesting = 0; ctx->err = false; ctx->lastFname = NULL; ctx->fnameLen = 0; if (ctx->buf != NULL) { ctx->buf[0] = '\0'; } } // extrace the first function name from a source buffer static char* searchFuncName(const char *source, size_t *len) { char *sep; char *name = NULL; /* * Search the opening paranthesis. The word before it is * the function name */ sep = strchr(source, '('); if (sep != NULL) { for (name = sep; name >= source; name--) { if ((*name == ' ') || (*name == '\n') || (*name == '*')) { break; } } name++; *len = (size_t)(sep - name); } return name; } /* * Immediately add string to source and does length check. * * The string should terminate with '\0' or pass size to copy */ static int checkAddStr(struct KgenContext *ctx, const char *str, size_t slen) { int ret = 0; size_t n = ctx->bufLen - ctx->currLen; size_t cplen; if (!slen) { slen = strlen(str); cplen = slen + 1; } else { cplen = slen; } if (ctx->buf == NULL) { ctx->currLen += slen; } else { if (cplen > n) { // make further code appendings unallowed ctx->err = true; ret = -1; } else { strncpy(ctx->buf + ctx->currLen, str, cplen); ctx->currLen += slen; } } return ret; } // add string to source, consiting of a prefix, a statement and a suffix static int addStr( struct KgenContext *ctx, const char *pref, const char *stmt, const char *suff) { int ret = 0; char blank[MAX_NESTING * TAB_WIDTH]; int i; char *sep = NULL; size_t len = 0; const int nblanks = (ctx->nesting + ctx->nrTabs) * TAB_WIDTH; if (nblanks && ctx->fmt) { for (i = 0; i < nblanks; i++) { blank[i] = ' '; } /* * add formatting symbols if there is a prefix, * or the statement don't begin with the new line * symbols */ if (pref || (stmt && (stmt[0] != '\n'))) { ret = checkAddStr(ctx, blank, nblanks); } } if (!ret && pref) { ret = checkAddStr(ctx, pref, 0); } /* * add the statement itself, * format the multiline ones if it's needed. */ while (!ret && stmt) { if (ctx->fmt) { /* * do not add tabulation for lines consisting of * the new line symbol only */ if (*stmt != '\n') { if (sep && nblanks) { ret = checkAddStr(ctx, blank, nblanks); if (ret) { break; } } sep = strchr(stmt, '\n'); // skip the new line symbol if it is at the end of the line if (sep && (sep[1] == '\0')) { sep = NULL; } len = (sep) ? (sep - stmt + 1) : 0; } else { /* * The line can start with the new line symbol * and have not any prefix. 
The assignment * ensures the tabulation for the case. */ sep = (sep) ? sep : ((char*)stmt); len = (stmt[1] == '\0') ? 0 : 1; } } ret = checkAddStr(ctx, stmt, len); if (len) { stmt += len; } else { stmt = NULL; } } if (!ret && suff) { ret = checkAddStr(ctx, suff, 0); } return ret; } struct KgenContext *createKgenContext(char *srcBuf, size_t srcBufLen, bool fmt) { struct KgenContext *ctx; ctx = malloc(sizeof(struct KgenContext)); if (ctx != NULL) { ctx->buf = srcBuf; ctx->bufLen = srcBufLen; ctx->fmt = fmt; ctx->nrTabs = 0; resetCtx(ctx); } return ctx; } static void flushDestroyStmtNode(ListNode *l, void *priv) { struct StmtNode *snode = container_of(l, node, struct StmtNode); if (priv != NULL) { addStr((struct KgenContext*)priv, NULL, snode->stmt, NULL); } free(snode->stmt); free(snode); } void destroyKgenContext(struct KgenContext *ctx) { if (ctx->lastFname) { free(ctx->lastFname); } free(ctx); } void resetKgenContext(struct KgenContext *ctx) { if (ctx->lastFname) { free(ctx->lastFname); } resetCtx(ctx); } int kgenSyncFormatting( struct KgenContext *srcCtx, const struct KgenContext *dstCtx, int nrTabs) { int ret = -EINVAL; if (nrTabs >= 0 && (nrTabs + dstCtx->nesting <= MAX_TABS)) { srcCtx->nesting = nrTabs + dstCtx->nesting; ret = 0; } return ret; } int kgenDeclareFunction(struct KgenContext *ctx, const char *decl) { int ret; size_t len; char *dbuf; const char *fnName; if (ctx->err || ctx->nesting) { ctx->err = true; return -1; } else { fnName = searchFuncName(decl, &len); if (fnName == NULL) { ret = -1; } else { // save the last declaration without dbuf = ctx->lastFname; if (dbuf == NULL) { dbuf = malloc(len + 1); } else if (ctx->fnameLen < len + 1) { dbuf = realloc(ctx->lastFname, len + 1); ctx->fnameLen = len + 1; } if (dbuf == NULL) { ret = -1; } else { strncpy(dbuf, fnName, len); dbuf[len] = '\0'; ctx->lastFname = dbuf; ret = addStr(ctx, NULL, decl, NULL); } } if (ret) { ctx->err = true; } } return ret; } int kgenBeginFuncBody(struct KgenContext *ctx) { int ret; if (ctx->err || ctx->nesting) { ctx->err = true; ret = -1; } else { ret = addStr(ctx, NULL, NULL, "{\n"); if (!ret) { ctx->nesting++; } } return ret; } int kgenEndFuncBody(struct KgenContext *ctx) { int ret; if (ctx->err || (ctx->nesting != 1)) { ctx->err = true; ret = -1; } else { ctx->nesting--; ret = addStr(ctx, NULL, NULL, "}\n"); } return ret; } int kgenGetLastFuncName( char *buf, size_t buflen, const struct KgenContext *ctx) { size_t len; int ret = -1; if (ctx->lastFname) { len = strlen(ctx->lastFname); if (buflen >= len + 1) { strncpy(buf, ctx->lastFname, len); buf[len] = '\0'; ret = 0; } } return ret; } int kgenBeginBranch(struct KgenContext *ctx, const char *stmt) { int ret; if (ctx->err || (ctx->nesting == MAX_NESTING)) { ctx->err = true; ret = -1; } else { const char *suff; if (stmt == NULL) { stmt = ""; suff = "{\n"; } else { suff = " {\n"; } ret = addStr(ctx, NULL, stmt, suff); if (!ret) { ctx->nesting++; } } return ret; } int kgenEndBranch(struct KgenContext *ctx, const char *stmt) { const char *pref; const char *suff; if (ctx->err || !ctx->nesting) { ctx->err = true; return -1; } ctx->nesting--; if (stmt) { pref = "} "; suff = ";\n"; } else { pref = "}\n"; suff = NULL; } return addStr(ctx, pref, stmt, suff); } int kgenAddStmt(struct KgenContext *ctx, const char *stmt) { int ret = 0; if (ctx->err) { ret = -1; } else if (stmt != NULL) { ret = addStr(ctx, NULL, stmt, NULL); } return ret; } int kgenPrintf(struct KgenContext *ctx, const char *fmt,...) 
{ char buf[MAX_STATEMENT_LENGTH]; va_list ap; int len; if (ctx->err) { return -1; } va_start(ap, fmt); len = vsnprintf(buf, MAX_STATEMENT_LENGTH, fmt, ap); va_end(ap); if (len >= MAX_STATEMENT_LENGTH) { /* has the statement been truncated? */ return -1; } return addStr(ctx, NULL, buf, NULL); } struct StatementBatch *createStmtBatch(void) { struct StatementBatch *batch; batch = malloc(sizeof(struct StatementBatch)); if (batch != NULL) { int i; for (i = 0; i <= MAX_STATEMENT_PRIORITY; i++) { listInitHead(&batch->statements[i]); } } return batch; } int kgenAddStmtToBatch( struct StatementBatch *batch, int priority, const char *stmt) { struct StmtNode *snode; int ret = -ENOMEM; if (priority == MAX_STATEMENT_PRIORITY) { return -EINVAL; } snode = malloc(sizeof(struct StmtNode)); if (snode != NULL) { snode->stmt = strdup(stmt); if (snode->stmt != NULL) { listAddToTail(&batch->statements[priority], &snode->node); ret = 0; } else { free(snode); } } return ret; } int kgenBatchPrintf( struct StatementBatch *batch, int priority, const char *fmt,...) { char buf[MAX_STATEMENT_LENGTH]; va_list ap; int len; va_start(ap, fmt); len = vsnprintf(buf, MAX_STATEMENT_LENGTH, fmt, ap); va_end(ap); if (len >= MAX_STATEMENT_LENGTH) { /* has the statement been truncated? */ return -1; } kgenAddStmtToBatch(batch, priority, buf); return 0; } int flushStmtBatch(struct KgenContext *ctx, struct StatementBatch *batch) { int i = 0; for (i = 0; i <= MAX_STATEMENT_PRIORITY; i++) { listDoForEachPrivSafe(&batch->statements[i], flushDestroyStmtNode, ctx); listInitHead(&batch->statements[i]); } return (ctx->err) ? -1 : 0; } void destroyStmtBatch(struct StatementBatch *batch) { int i; for (i = 0; i <= MAX_STATEMENT_PRIORITY; i++) { listDoForEachPrivSafe(&batch->statements[i], flushDestroyStmtNode, NULL); } free(batch); } int kgenAddBlankLine(struct KgenContext *ctx) { int ret; if (ctx->err) { ret = -1; } else { ret = addStr(ctx, NULL, NULL, "\n"); } return ret; } size_t kgenSourceSize(struct KgenContext *ctx) { return ctx->currLen; } clblas-2.10/src/library/common/kgen_basic.c000066400000000000000000000214671264277366700207060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include #include #include const char *uptrsFullDeclaration = "#ifdef cl_khr_fp64\n" "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n" "#else\n" "#pragma OPENCL EXTENSION cl_amd_fp64 : enable\n" "#endif\n" "\n" "typedef union GPtr {\n" " __global float *f;\n" " __global double *d;\n" " __global float2 *f2v;\n" " __global double2 *d2v;\n" " __global float4 *f4v;\n" " __global double4 *d4v;\n" " __global float8 *f8v;\n" " __global double8 *d8v;\n" " __global float16 *f16v;\n" " __global double16 *d16v;\n" "} GPtr;\n" "\n" "typedef union LPtr {\n" " __local float *f;\n" " __local double *d;\n" " __local float2 *f2v;\n" " __local double2 *d2v;\n" " __local float4 *f4v;\n" " __local double4 *d4v;\n" " __local float8 *f8v;\n" " __local double8 *d8v;\n" " __local float16 *f16v;\n" " __local double16 *d16v;\n" "} LPtr;\n" "\n" "typedef union PPtr {\n" " float *f;\n" " double *d;\n" " float2 *f2v;\n" " double2 *d2v;\n" " float4 *f4v;\n" " double4 *d4v;\n" " float8 *f8v;\n" " double8 *d8v;\n" " float16 *f16v;\n" " double16 *d16v;\n" "} PPtr;\n\n"; const char *uptrsSingleDeclaration = "typedef union GPtr {\n" " __global float *f;\n" " __global float2 *f2v;\n" " __global float4 *f4v;\n" " __global float8 *f8v;\n" " __global float16 *f16v;\n" "} GPtr;\n" "\n" "typedef union LPtr {\n" " __local float *f;\n" " __local float2 *f2v;\n" " __local float4 *f4v;\n" " __local float8 *f8v;\n" " __local float16 *f16v;\n" "} LPtr;\n" "\n" "typedef union PPtr {\n" " float *f;\n" " float2 *f2v;\n" " float4 *f4v;\n" " float8 *f8v;\n" " float16 *f16v;\n" "} PPtr;\n\n"; const char *uptrTypeName(UptrType type) { const char *s = NULL; switch(type) { case UPTR_GLOBAL: s = "GPtr"; break; case UPTR_LOCAL: s = "LPtr"; break; case UPTR_PRIVATE: s = "PPtr"; break; } return s; } char dtypeToPrefix(DataType type) { char c; switch (type) { case TYPE_FLOAT: c = 'f'; break; case TYPE_DOUBLE: c = 'd'; break; case TYPE_COMPLEX_FLOAT: c = 'c'; break; case TYPE_COMPLEX_DOUBLE: c = 'z'; break; default: c = 0; break; } return c; } const char *dtypeBuiltinType(DataType dtype) { const char *s; switch (dtype) { case TYPE_FLOAT: s = "float"; break; case TYPE_DOUBLE: s = "double"; break; case TYPE_COMPLEX_FLOAT: s = "float2"; break; case TYPE_COMPLEX_DOUBLE: s = "double2"; break; default: s = NULL; break; } return s; } const char *dtypeUPtrField(DataType dtype) { const char *s; switch (dtype) { case TYPE_FLOAT: s = "f"; break; case TYPE_DOUBLE: s = "d"; break; case TYPE_COMPLEX_FLOAT: s = "f2v"; break; case TYPE_COMPLEX_DOUBLE: s = "d2v"; break; default: s = NULL; break; } return s; } const char *strOne(DataType dtype) { const char *s; if (isComplexType(dtype)) { if (isDoubleBasedType(dtype)) { s = "(double2)(1, 0)"; } else { s = "(float2)(1, 0)"; } } else { s = "1"; } return s; } void getVectorTypeName( DataType dtype, unsigned int vecLen, const char **typeName, const char **typePtrName) { char *tn = ""; char *tpn = ""; if (isDoubleBasedType(dtype)) { switch (vecLen * dtypeSize(dtype)) { case sizeof(cl_double): tn = "double"; tpn = "d"; break; case sizeof(cl_double2): tn = "double2"; tpn = "d2v"; break; case sizeof(cl_double4): tn = "double4"; tpn = "d4v"; break; case sizeof(cl_double8): tn = "double8"; tpn = "d8v"; break; case sizeof(cl_double16): tn = "double16"; tpn = "d16v"; break; }; } else { switch (vecLen * dtypeSize(dtype)) { case sizeof(cl_float): tn = "float"; tpn = "f"; break; case sizeof(cl_float2): tn = 
"float2"; tpn = "f2v"; break; case sizeof(cl_float4): tn = "float4"; tpn = "f4v"; break; case sizeof(cl_float8): tn = "float8"; tpn = "f8v"; break; case sizeof(cl_float16): tn = "float16"; tpn = "f16v"; break; }; } if (typeName != NULL) { *typeName = tn; } if (typePtrName != NULL) { *typePtrName = tpn; } } int kgenAddBarrier( struct KgenContext *ctx, CLMemFence fence) { int ret; if (fence == CLK_LOCAL_MEM_FENCE) { ret = kgenAddStmt(ctx, "barrier(CLK_LOCAL_MEM_FENCE);\n"); } else { ret = kgenAddStmt(ctx, "barrier(CLK_GLOBAL_MEM_FENCE);\n"); } if (ret) { ret = -EOVERFLOW; } return ret; } int kgenAddMemFence( struct KgenContext *ctx, CLMemFence fence) { int ret; if (fence == CLK_LOCAL_MEM_FENCE) { ret = kgenAddStmt(ctx, "mem_fence(CLK_LOCAL_MEM_FENCE);\n"); } else { ret = kgenAddStmt(ctx, "mem_fence(CLK_GLOBAL_MEM_FENCE);\n"); } if (ret) { ret = -EOVERFLOW; } return ret; } int kgenDeclareLocalID( struct KgenContext *ctx, const char *lidName, const PGranularity *pgran) { char tmp[128]; int r; if (pgran->wgDim == 1) { sprintf(tmp, "const int %s = get_local_id(0);\n", lidName); } else { sprintf(tmp, "const int %s = get_local_id(1) * %u + " "get_local_id(0);\n", lidName, pgran->wgSize[0]); } r = kgenAddStmt(ctx, tmp); return (r) ? -EOVERFLOW : 0; } int kgenDeclareGroupID( struct KgenContext *ctx, const char *gidName, const PGranularity *pgran) { char tmp[128]; int r; if (pgran->wgDim == 1) { sprintf(tmp, "const int %s = get_global_id(0) / %u;\n", gidName, pgran->wgSize[0]); } else { sprintf(tmp, "const int %s = (get_global_id(1) / %u) * " "(get_global_size(0) / %u) + " "get_global_id(0) / %u;\n", gidName, pgran->wgSize[1], pgran->wgSize[0], pgran->wgSize[0]); } r = kgenAddStmt(ctx, tmp); return (r) ? -EOVERFLOW : 0; } int kgenDeclareUptrs(struct KgenContext *ctx, bool withDouble) { int ret; const char *s; s = (withDouble) ? uptrsFullDeclaration : uptrsSingleDeclaration; ret = kgenAddStmt(ctx, s); return ret ? -EOVERFLOW: 0; } void kstrcpy(Kstring *kstr, const char *str) { const int lastByte = sizeof(kstr->buf) - 1; kstr->buf[lastByte] = '\0'; strncpy(kstr->buf, str, sizeof(kstr->buf)); assert(kstr->buf[lastByte] == '\0'); } void ksprintf(Kstring *kstr, const char *fmt,...) { va_list ap; int len; va_start(ap, fmt); len = vsnprintf(kstr->buf, sizeof(kstr->buf), fmt, ap); va_end(ap); // to mute GCC with its warning regarding set but unused variables #ifdef NDEBUG (void)len; #endif assert((size_t)len < sizeof(kstr->buf)); } void kstrcatf(Kstring *kstr, const char *fmt,...) { va_list ap; int len, maxlen; va_start(ap, fmt); len = (int)strlen(kstr->buf); maxlen = sizeof(kstr->buf) - len; len = vsnprintf(kstr->buf + len, maxlen, fmt, ap); va_end(ap); assert(len < maxlen); } clblas-2.10/src/library/common/kgen_guard.c000066400000000000000000000077531264277366700207310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include typedef struct FuncNode { void *pattern; char funcName[FUNC_NAME_MAXLEN]; ListNode node; } FuncNode; typedef struct FuncNodeKey { const void *pattern; size_t patSize; } FuncNodeKey; struct KgenGuard { struct KgenContext *ctx; int (*genCallback)(struct KgenContext*, const void*); size_t patSize; ListHead funcs; }; static int funcNodeCmp(const ListNode *n, const void *key) { const FuncNode *fnode = container_of(n, node, FuncNode); const FuncNodeKey *fkey = (FuncNodeKey*)key; return memcmp(fnode->pattern, fkey->pattern, fkey->patSize); } static void destroyFuncNode(ListNode *node) { FuncNode *fnode = container_of(node, node, FuncNode); free(fnode->pattern); free(fnode); } struct KgenGuard *createKgenGuard( struct KgenContext *ctx, int (*genCallback)(struct KgenContext *ctx, const void *pattern), size_t patSize) { struct KgenGuard *guard; guard = malloc(sizeof(struct KgenGuard)); if (guard != NULL) { guard->ctx = ctx; guard->genCallback = genCallback; guard->patSize = patSize; listInitHead(&guard->funcs); } return guard; } void reinitKgenGuard( struct KgenGuard *guard, struct KgenContext *ctx, int (*genCallback)(struct KgenContext *ctx, const void *pattern), size_t patSize) { listDoForEachSafe(&guard->funcs, destroyFuncNode); listInitHead(&guard->funcs); guard->ctx = ctx; guard->genCallback = genCallback; guard->patSize = patSize; } /* * Invokes generator to generate a function * matching to the 'pattern' pattern or just * returns its name if the function is already * generated */ int findGenerateFunction( struct KgenGuard *guard, const void *pattern, char *name, size_t nameLen) { ListNode *n; FuncNode *fnode = NULL; FuncNodeKey fkey = {pattern, guard->patSize}; int ret = 0; n = listNodeSearch(&guard->funcs, &fkey, funcNodeCmp); if (n == NULL) { ret = guard->genCallback(guard->ctx, pattern); if (!ret) { fnode = malloc(sizeof(FuncNode)); if (fnode == NULL) { ret = -ENOMEM; } else { fnode->pattern = malloc(guard->patSize); if (fnode->pattern == NULL) { free(fnode); ret = -ENOMEM; } else { memcpy(fnode->pattern, pattern, guard->patSize); kgenGetLastFuncName(fnode->funcName, sizeof(fnode->funcName), guard->ctx); fnode->funcName[FUNC_NAME_MAXLEN - 1] = '\0'; listAddToTail(&guard->funcs, &fnode->node); } } } else { ret = -EOVERFLOW; } } else { fnode = container_of(n, node, FuncNode); } if (!ret) { strncpy(name, fnode->funcName, nameLen); name[nameLen - 1] = '\0'; } return ret; } void destroyKgenGuard(struct KgenGuard *guard) { listDoForEachSafe(&guard->funcs, destroyFuncNode); free(guard); } clblas-2.10/src/library/common/kgen_loop_helper.c000066400000000000000000000053341264277366700221300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #include int kgenLoopUnroll( struct KgenContext *ctx, LoopCtl *loopCtl, DataType dtype, const LoopUnrollers *unrollers, void *priv) { int ret = 0; char tmp[1024]; unsigned long i, n; unsigned int nfloats; int vecLen; if (!(dtype == TYPE_FLOAT || dtype == TYPE_DOUBLE || dtype == TYPE_COMPLEX_FLOAT || dtype == TYPE_COMPLEX_DOUBLE)) { return -EINVAL; } if (unrollers->genSingle == NULL) { return -EINVAL; } nfloats = dtypeSize(dtype) / sizeof(cl_float); vecLen = (unrollers->getVecLen == NULL)? FLOAT4_VECLEN : unrollers->getVecLen(ctx, priv); if (loopCtl->ocName) { if (loopCtl->obConst) { sprintf(tmp, "for (%s = 0; %s < %lu; %s++)", loopCtl->ocName, loopCtl->ocName, loopCtl->outBound.val, loopCtl->ocName); } else { sprintf(tmp, "for (%s = 0; %s < %s; %s++)", loopCtl->ocName, loopCtl->ocName, loopCtl->outBound.name, loopCtl->ocName); } kgenBeginBranch(ctx, tmp); } if (unrollers->preUnroll) { ret = unrollers->preUnroll(ctx, priv); } if ((dtype != TYPE_COMPLEX_DOUBLE) && unrollers->genSingleVec) { n = loopCtl->inBound * nfloats / vecLen; for (i = 0; (i < n) && !ret; i++) { ret = unrollers->genSingleVec(ctx, priv); } n = loopCtl->inBound % (vecLen / nfloats); } else { n = loopCtl->inBound; } for (i = 0; (i < n) && !ret; i++) { ret = unrollers->genSingle(ctx, priv); } if (unrollers->postUnroll && !ret) { ret = unrollers->postUnroll(ctx, priv); } if (loopCtl->ocName && !ret) { ret = kgenEndBranch(ctx, NULL); } return ret ? 0 : -EOVERFLOW; } clblas-2.10/src/library/common/list.c000066400000000000000000000055031264277366700175650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include static __inline void listAddAfter(ListNode *prev, ListNode *node) { ListNode *next = prev->next; prev->next = node; node->prev = prev; node->next = next; next->prev = node; } void listAddToTail(ListHead *head, ListNode *node) { listAddAfter(head->prev, node); } void listAddToHead(ListHead *head, ListNode *node) { listAddAfter(head, node); } void listDel(ListNode *node) { #ifdef DEBUG // check if it's not really the list head assert(node->next != node->prev); #endif node->prev->next = node->next; node->next->prev = node->prev; } ListNode *listDelFromTail(ListHead *head) { ListNode *node = head->prev; listDel(node); return node; } void listDoForEach(ListHead *head, ListAction act) { ListNode *node; for (node = listNodeFirst(head); node != head; node = node->next) { act(node); } } void listDoForEachSafe(ListHead *head, ListAction act) { ListNode *node, *save; for (node = listNodeFirst(head), save = node->next; node != head; node = save, save = node->next) { act(node); } } void listDoForEachPriv(const ListHead *head, ListPrivAction act, void *actPriv) { ListNode *node; for (node = listNodeFirst(head); node != head; node = node->next) { act(node, actPriv); } } void listDoForEachPrivSafe(const ListHead *head, ListPrivAction act, void *actPriv) { ListNode *node, *save; for (node = listNodeFirst(head), save = node->next; node != head; node = save, save = node->next) { act(node, actPriv); } } ListNode *listNodeSearch(const ListHead *head, const void *key, ListCmpFn cmp) { ListNode *node; for (node = listNodeFirst(head); node != head; node = node->next) { if (!cmp(node, key)) { break; } } return (node == head) ? NULL : node; } size_t listLength(const ListHead *head) { size_t length = 0; ListNode *node; for (node= listNodeFirst(head); node != head; node = node->next) { length++; } return length; } clblas-2.10/src/library/common/md5sum.c000066400000000000000000000252161264277366700200270ustar00rootroot00000000000000/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * (This is a heavily cut-down "BSD license".) * * This differs from Colin Plumb's older public domain implementation in that * no exactly 32-bit integer data type is required (any 32-bit or wider * unsigned integer data type will do), there's no compile-time endianness * configuration, and the function prototypes match OpenSSL's. No code from * Colin Plumb's implementation has been reused; this comment merely compares * the properties of the two independent implementations. * * The primary goals of this implementation are portability and ease of use. * It is meant to be fast, but not as fast as possible. 
Some known * optimizations are not included to reduce source code size and avoid * compile-time configuration. */ #ifndef HAVE_OPENSSL #include #include #include #include /* * The basic MD5 functions. * * F and G are optimized compared to their RFC 1321 definitions for * architectures that lack an AND-NOT instruction, just like in Colin Plumb's * implementation. */ #define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) #define G(x, y, z) ((y) ^ ((z) & ((x) ^ (y)))) #define H(x, y, z) (((x) ^ (y)) ^ (z)) #define H2(x, y, z) ((x) ^ ((y) ^ (z))) #define I(x, y, z) ((y) ^ ((x) | ~(z))) /* * The MD5 transformation for all four rounds. */ #define STEP(f, a, b, c, d, x, t, s) \ (a) += f((b), (c), (d)) + (x) + (t); \ (a) = (((a) << (s)) | (((a) & 0xffffffff) >> (32 - (s)))); \ (a) += (b); /* * SET reads 4 input bytes in little-endian byte order and stores them * in a properly aligned word in host byte order. * * The check for little-endian architectures that tolerate unaligned * memory accesses is just an optimization. Nothing will break if it * doesn't work. */ #if defined(__i386__) || defined(__x86_64__) || defined(__vax__) #define SET(n) \ (*(MD5_u32plus *)&ptr[(n) * 4]) #define GET(n) \ SET(n) #else #define SET(n) \ (ctx->block[(n)] = \ (MD5_u32plus)ptr[(n) * 4] | \ ((MD5_u32plus)ptr[(n) * 4 + 1] << 8) | \ ((MD5_u32plus)ptr[(n) * 4 + 2] << 16) | \ ((MD5_u32plus)ptr[(n) * 4 + 3] << 24)) #define GET(n) \ (ctx->block[(n)]) #endif /* * This processes one or more 64-byte data blocks, but does NOT update * the bit counters. There are no alignment requirements. */ static const void *body(MD5_CTX *ctx, const void *data, unsigned long size) { const unsigned char *ptr; MD5_u32plus a, b, c, d; MD5_u32plus saved_a, saved_b, saved_c, saved_d; ptr = (const unsigned char *)data; a = ctx->a; b = ctx->b; c = ctx->c; d = ctx->d; do { saved_a = a; saved_b = b; saved_c = c; saved_d = d; /* Round 1 */ STEP(F, a, b, c, d, SET(0), 0xd76aa478, 7) STEP(F, d, a, b, c, SET(1), 0xe8c7b756, 12) STEP(F, c, d, a, b, SET(2), 0x242070db, 17) STEP(F, b, c, d, a, SET(3), 0xc1bdceee, 22) STEP(F, a, b, c, d, SET(4), 0xf57c0faf, 7) STEP(F, d, a, b, c, SET(5), 0x4787c62a, 12) STEP(F, c, d, a, b, SET(6), 0xa8304613, 17) STEP(F, b, c, d, a, SET(7), 0xfd469501, 22) STEP(F, a, b, c, d, SET(8), 0x698098d8, 7) STEP(F, d, a, b, c, SET(9), 0x8b44f7af, 12) STEP(F, c, d, a, b, SET(10), 0xffff5bb1, 17) STEP(F, b, c, d, a, SET(11), 0x895cd7be, 22) STEP(F, a, b, c, d, SET(12), 0x6b901122, 7) STEP(F, d, a, b, c, SET(13), 0xfd987193, 12) STEP(F, c, d, a, b, SET(14), 0xa679438e, 17) STEP(F, b, c, d, a, SET(15), 0x49b40821, 22) /* Round 2 */ STEP(G, a, b, c, d, GET(1), 0xf61e2562, 5) STEP(G, d, a, b, c, GET(6), 0xc040b340, 9) STEP(G, c, d, a, b, GET(11), 0x265e5a51, 14) STEP(G, b, c, d, a, GET(0), 0xe9b6c7aa, 20) STEP(G, a, b, c, d, GET(5), 0xd62f105d, 5) STEP(G, d, a, b, c, GET(10), 0x02441453, 9) STEP(G, c, d, a, b, GET(15), 0xd8a1e681, 14) STEP(G, b, c, d, a, GET(4), 0xe7d3fbc8, 20) STEP(G, a, b, c, d, GET(9), 0x21e1cde6, 5) STEP(G, d, a, b, c, GET(14), 0xc33707d6, 9) STEP(G, c, d, a, b, GET(3), 0xf4d50d87, 14) STEP(G, b, c, d, a, GET(8), 0x455a14ed, 20) STEP(G, a, b, c, d, GET(13), 0xa9e3e905, 5) STEP(G, d, a, b, c, GET(2), 0xfcefa3f8, 9) STEP(G, c, d, a, b, GET(7), 0x676f02d9, 14) STEP(G, b, c, d, a, GET(12), 0x8d2a4c8a, 20) /* Round 3 */ STEP(H, a, b, c, d, GET(5), 0xfffa3942, 4) STEP(H2, d, a, b, c, GET(8), 0x8771f681, 11) STEP(H, c, d, a, b, GET(11), 0x6d9d6122, 16) STEP(H2, b, c, d, a, GET(14), 0xfde5380c, 23) STEP(H, a, b, c, d, GET(1), 
0xa4beea44, 4) STEP(H2, d, a, b, c, GET(4), 0x4bdecfa9, 11) STEP(H, c, d, a, b, GET(7), 0xf6bb4b60, 16) STEP(H2, b, c, d, a, GET(10), 0xbebfbc70, 23) STEP(H, a, b, c, d, GET(13), 0x289b7ec6, 4) STEP(H2, d, a, b, c, GET(0), 0xeaa127fa, 11) STEP(H, c, d, a, b, GET(3), 0xd4ef3085, 16) STEP(H2, b, c, d, a, GET(6), 0x04881d05, 23) STEP(H, a, b, c, d, GET(9), 0xd9d4d039, 4) STEP(H2, d, a, b, c, GET(12), 0xe6db99e5, 11) STEP(H, c, d, a, b, GET(15), 0x1fa27cf8, 16) STEP(H2, b, c, d, a, GET(2), 0xc4ac5665, 23) /* Round 4 */ STEP(I, a, b, c, d, GET(0), 0xf4292244, 6) STEP(I, d, a, b, c, GET(7), 0x432aff97, 10) STEP(I, c, d, a, b, GET(14), 0xab9423a7, 15) STEP(I, b, c, d, a, GET(5), 0xfc93a039, 21) STEP(I, a, b, c, d, GET(12), 0x655b59c3, 6) STEP(I, d, a, b, c, GET(3), 0x8f0ccc92, 10) STEP(I, c, d, a, b, GET(10), 0xffeff47d, 15) STEP(I, b, c, d, a, GET(1), 0x85845dd1, 21) STEP(I, a, b, c, d, GET(8), 0x6fa87e4f, 6) STEP(I, d, a, b, c, GET(15), 0xfe2ce6e0, 10) STEP(I, c, d, a, b, GET(6), 0xa3014314, 15) STEP(I, b, c, d, a, GET(13), 0x4e0811a1, 21) STEP(I, a, b, c, d, GET(4), 0xf7537e82, 6) STEP(I, d, a, b, c, GET(11), 0xbd3af235, 10) STEP(I, c, d, a, b, GET(2), 0x2ad7d2bb, 15) STEP(I, b, c, d, a, GET(9), 0xeb86d391, 21) a += saved_a; b += saved_b; c += saved_c; d += saved_d; ptr += 64; } while (size -= 64); ctx->a = a; ctx->b = b; ctx->c = c; ctx->d = d; return ptr; } void MD5_Init(MD5_CTX *ctx) { ctx->a = 0x67452301; ctx->b = 0xefcdab89; ctx->c = 0x98badcfe; ctx->d = 0x10325476; ctx->lo = 0; ctx->hi = 0; } void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size) { MD5_u32plus saved_lo; unsigned long used, available; saved_lo = ctx->lo; if ((ctx->lo = (saved_lo + size) & 0x1fffffff) < saved_lo) ctx->hi++; ctx->hi += size >> 29; used = saved_lo & 0x3f; if (used) { available = 64 - used; if (size < available) { memcpy(&ctx->buffer[used], data, size); return; } memcpy(&ctx->buffer[used], data, available); data = (const unsigned char *)data + available; size -= available; body(ctx, ctx->buffer, 64); } if (size >= 64) { data = body(ctx, data, size & ~(unsigned long)0x3f); size &= 0x3f; } memcpy(ctx->buffer, data, size); } void MD5_Final(unsigned char *result, MD5_CTX *ctx) { unsigned long used, available; used = ctx->lo & 0x3f; ctx->buffer[used++] = 0x80; available = 64 - used; if (available < 8) { memset(&ctx->buffer[used], 0, available); body(ctx, ctx->buffer, 64); used = 0; available = 64; } memset(&ctx->buffer[used], 0, available - 8); ctx->lo <<= 3; ctx->buffer[56] = ctx->lo; ctx->buffer[57] = ctx->lo >> 8; ctx->buffer[58] = ctx->lo >> 16; ctx->buffer[59] = ctx->lo >> 24; ctx->buffer[60] = ctx->hi; ctx->buffer[61] = ctx->hi >> 8; ctx->buffer[62] = ctx->hi >> 16; ctx->buffer[63] = ctx->hi >> 24; body(ctx, ctx->buffer, 64); result[0] = ctx->a; result[1] = ctx->a >> 8; result[2] = ctx->a >> 16; result[3] = ctx->a >> 24; result[4] = ctx->b; result[5] = ctx->b >> 8; result[6] = ctx->b >> 16; result[7] = ctx->b >> 24; result[8] = ctx->c; result[9] = ctx->c >> 8; result[10] = ctx->c >> 16; result[11] = ctx->c >> 24; result[12] = ctx->d; result[13] = ctx->d >> 8; result[14] = ctx->d >> 16; result[15] = ctx->d >> 24; memset(ctx, 0, sizeof(*ctx)); } char * md5sum(const void * data, unsigned long size) { unsigned char digest[16]; int i; char * md5string = (char*)malloc(33*sizeof(char)); MD5_CTX context; MD5_Init(&context); MD5_Update(&context, data, size); MD5_Final(digest, &context); for(i = 0; i < 16; ++i) sprintf(&md5string[i*2], "%02x", (unsigned int)digest[i]); return (char*) md5string; } #endif 
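/*
 * Usage sketch (illustrative, not part of the original file): md5sum()
 * returns a heap-allocated, NUL-terminated 32-character hex string that the
 * caller is responsible for freeing, e.g.
 *
 *     const char *src = "kernel void foo(void) { }";
 *     char *digest = md5sum(src, (unsigned long)strlen(src));
 *     if (digest != NULL) {
 *         printf("source digest: %s\n", digest);
 *         free(digest);
 *     }
 *
 * The 33-byte buffer holds the 16 digest bytes rendered as two hex characters
 * each plus the terminating '\0'.
 */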
/* * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc. * MD5 Message-Digest Algorithm (RFC 1321). * * Homepage: * http://openwall.info/wiki/people/solar/software/public-domain-source-code/md5 * * Author: * Alexander Peslyak, better known as Solar Designer * * This software was written by Alexander Peslyak in 2001. No copyright is * claimed, and the software is hereby placed in the public domain. * In case this attempt to disclaim copyright and place the software in the * public domain is deemed null and void, then the software is * Copyright (c) 2001 Alexander Peslyak and it is hereby released to the * general public under the following terms: * * Redistribution and use in source and binary forms, with or without * modification, are permitted. * * There's ABSOLUTELY NO WARRANTY, express or implied. * * See md5.c for more information. */ #ifdef HAVE_OPENSSL #include #elif !defined(_MD5_H) #define _MD5_H /* Any 32-bit or wider unsigned integer data type will do */ typedef unsigned int MD5_u32plus; typedef struct { MD5_u32plus lo, hi; MD5_u32plus a, b, c, d; unsigned char buffer[64]; MD5_u32plus block[16]; } MD5_CTX; extern void MD5_Init(MD5_CTX *ctx); extern void MD5_Update(MD5_CTX *ctx, const void *data, unsigned long size); extern void MD5_Final(unsigned char *result, MD5_CTX *ctx); char * md5sum(const void * data, unsigned long size) { unsigned char * digest = malloc(16 * sizeof(unsigned char)); MD5_CTX context; MD5_Init(&context); MD5_Update(&context, data, size); MD5_Final(digest, &context); char * md5string = malloc(33*sizeof(char)); for(int i = 0; i < 16; ++i) sprintf(&md5string[i*2], "%02x", (unsigned int)digest[i]); return md5string; } #endif clblas-2.10/src/library/common/misc.c000066400000000000000000000030001264277366700175330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include unsigned int dtypeSize(DataType type) { size_t ret; switch (type) { case TYPE_FLOAT: ret = sizeof(cl_float); break; case TYPE_DOUBLE: ret = sizeof(cl_double); break; case TYPE_COMPLEX_FLOAT: ret = sizeof(cl_float2); break; case TYPE_COMPLEX_DOUBLE: ret = sizeof(cl_double2); break; case TYPE_UNSIGNED_INT:// For iAMAX ret = sizeof(cl_uint); break; default: ret = (size_t)-1; break; } return (unsigned int)ret; } size_t fl4RowWidth(size_t width, size_t typeSize) { size_t s; s = width / (sizeof(cl_float4) / typeSize); if (s * (sizeof(cl_float4) / typeSize) != width) { s++; } return s; } clblas-2.10/src/library/common/mutex.c000066400000000000000000000050271264277366700177550ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #if defined(_MSC_VER) #pragma warning(push,3) #include #pragma warning(pop) mutex_t* mutexInit(void) { HANDLE mutex; mutex = CreateMutex(NULL, FALSE, NULL); return (mutex_t*)mutex; } int mutexDestroy(mutex_t *_mutex) { HANDLE mutex = (HANDLE)_mutex; if (CloseHandle(mutex) == FALSE) { /* Bad mutex, etc. */ return 1; } return 0; } int mutexLock(mutex_t *_mutex) { HANDLE mutex = (HANDLE)_mutex; DWORD rc; rc = WaitForSingleObjectEx(mutex, INFINITE, FALSE); if (rc != WAIT_OBJECT_0) { /* Bad mutex, etc. */ return 1; } return 0; } int mutexUnlock(mutex_t *_mutex) { HANDLE mutex = (HANDLE)_mutex; if (ReleaseMutex(mutex) == FALSE) { /* Bad mutex, etc. */ return 1; } return 0; } #else /* defined(_MSC_VER) */ #include #include mutex_t* mutexInit(void) { pthread_mutex_t *mutex; mutex = calloc(1, sizeof(pthread_mutex_t)); if (mutex == NULL) return NULL; if (pthread_mutex_init(mutex, NULL) != 0) { free(mutex); return NULL; } return (mutex_t*)mutex; } int mutexDestroy(mutex_t *_mutex) { pthread_mutex_t *mutex = (pthread_mutex_t*)_mutex; if (mutex == NULL) { /* Mutex is invalid */ return 1; } if (pthread_mutex_destroy(mutex) != 0) { /* Mutex is busy or invalid */ return 1; } free(mutex); return 0; } int mutexLock(mutex_t *_mutex) { pthread_mutex_t *mutex = (pthread_mutex_t*)_mutex; return (pthread_mutex_lock(mutex) == 0) ? 0 : 1; } int mutexUnlock(mutex_t *_mutex) { pthread_mutex_t *mutex = (pthread_mutex_t*)_mutex; return (pthread_mutex_unlock(mutex) == 0) ? 0 : 1; } #endif /* defined (_MSC_VER) */ clblas-2.10/src/library/common/rwlock.c000066400000000000000000000062341264277366700201150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #if defined(_MSC_VER) #pragma warning(push,3) // Need Synchapi.h since Windows 8 or Windows Server 2012? 
#include #pragma warning(pop) rwlock_t* rwlockInit(void) { PSRWLOCK rwlock; rwlock = (PSRWLOCK)calloc(1, sizeof(SRWLOCK)); if (rwlock == NULL) return NULL; InitializeSRWLock(rwlock); return (rwlock_t*) rwlock; } int rwlockDestroy(rwlock_t *_rwlock) { if (_rwlock == NULL) { /* Mutex is invalid */ return 1; } free(_rwlock); return 0; } int rwlockReadLock(rwlock_t *_rwlock ) { BOOLEAN acquired = 0; while(!acquired) acquired = TryAcquireSRWLockShared((PSRWLOCK) _rwlock); return (acquired != 0); } int rwlockWriteLock(rwlock_t *_rwlock ) { BOOLEAN acquired = 0; while(!acquired) acquired = TryAcquireSRWLockExclusive((PSRWLOCK) _rwlock); return (acquired != 0); } int rwlockReadUnlock(rwlock_t *_rwlock ) { ReleaseSRWLockShared((PSRWLOCK) _rwlock); return 0; } int rwlockWriteUnlock(rwlock_t *_rwlock ) { ReleaseSRWLockExclusive((PSRWLOCK)_rwlock); return 0; } #else /* defined(_MSC_VER) */ #include #include rwlock_t* rwlockInit(void) { pthread_rwlock_t *rwlock; rwlock = calloc(1, sizeof(pthread_rwlock_t)); if (rwlock == NULL) return NULL; if (pthread_rwlock_init(rwlock, NULL) != 0) { free(rwlock); return NULL; } return (rwlock_t*) rwlock; } int rwlockDestroy(rwlock_t *_rwlock) { pthread_rwlock_t *rwlock = (pthread_rwlock_t*)_rwlock; if (rwlock == NULL) { /* Mutex is invalid */ return 1; } if (pthread_rwlock_destroy(rwlock) != 0) { /* Mutex is busy or invalid */ return 1; } free(rwlock); return 0; } int rwlockReadLock(rwlock_t *_rwlock ) { pthread_rwlock_t *rwlock = (pthread_rwlock_t*)_rwlock; return (pthread_rwlock_rdlock(rwlock ) == 0) ? 0 : 1; } int rwlockWriteLock(rwlock_t *_rwlock ){ pthread_rwlock_t *rwlock = (pthread_rwlock_t*)_rwlock; return (pthread_rwlock_wrlock(rwlock ) == 0) ? 0 : 1; } int rwlockReadUnlock(rwlock_t *_rwlock ) { pthread_rwlock_t *rwlock = (pthread_rwlock_t*)_rwlock; return (pthread_rwlock_unlock(rwlock ) == 0) ? 0 : 1; } int rwlockWriteUnlock(rwlock_t *_rwlock ) { pthread_rwlock_t *rwlock = (pthread_rwlock_t*)_rwlock; return (pthread_rwlock_unlock(rwlock ) == 0) ? 0 : 1; } #endif /* defined (_MSC_VER) */ clblas-2.10/src/library/common/tests/000077500000000000000000000000001264277366700176055ustar00rootroot00000000000000clblas-2.10/src/library/common/tests/CMakeLists.txt000066400000000000000000000036471264277366700223570ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
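# Note: the test executables defined in this file compile the common library
# sources (SRC_COMMON) directly rather than linking against the clBLAS target,
# so they only need the OpenCL and math libraries at link time.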
# ######################################################################## set(SRC_COMMON ../list.c ../clkern.c ../kern_cache.c ../kerngen_core.c ../kgen_basic.c ../kgen_loop_helper.c ../kgen_guard.c ../misc.c ../gens/dblock_kgen.c ../devinfo.c ../devinfo-cache.c ../mutex.c ../trace_malloc.c ) set(SRC_DBLOCK_KGEN ${SRC_COMMON} t_dblock_kgen.c ) set(SRC_GENS_CACHE ${SRC_COMMON} t_gens_cache.c ) include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/include ${clBLAS_SOURCE_DIR}/src/blas/include) add_executable(t_dblock_kgen ${SRC_DBLOCK_KGEN}) target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY}) set_target_properties( t_dblock_kgen PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) add_executable(t_gens_cache ${SRC_GENS_CACHE}) target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY}) set_target_properties( t_gens_cache PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) # CPack configuration; include the executable into the package install( TARGETS t_dblock_kgen t_gens_cache RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) clblas-2.10/src/library/common/tests/t_dblock_kgen.c000066400000000000000000001157301264277366700225450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * data block processing function * generators test * * NOTES: * 1) The test can run incorrectly on devices with * wavefront less than 64. * 2) The test with -n or (and) -o option will not work * on CPU since unaligned access to vector data are * not allowed for it. */ #include #include #include #include #include #include #include #include #define MAX(a, b) ((b) > (a)) ? 
(b) : (a) #define ARRAY_LENGTH(ar) sizeof((ar)) / sizeof((ar)[0]) #define EXTRACT_COMPLEX_DOUBLE(ptr, type, re, img) \ do { \ type *ptr1 = (type*)ptr; \ \ re = ptr1->s[0]; \ img = ptr1->s[1]; \ } while (0) #define MUL_COMPLEX(mul1, mul2, type) \ do { \ type *mul11 = (type*)mul1; \ type *mul21 = (type*)mul2; \ type tmp = *mul11; \ \ mul11->s[0] = tmp.s[0] * mul21->s[0] - tmp.s[1] * mul21->s[1]; \ mul11->s[1] = tmp.s[0] * mul21->s[1] + tmp.s[1] * mul21->s[0]; \ } while (0) \ enum { SOURCE_BUFLEN = 1048576 }; enum { DEBUG_BUFLEN = 1048576 }; typedef enum TransposeType { TRANSPOSE_LOCAL, // transpose at copying to the local memory TRANSPOSE_GLOBAL, // transpose at copying to the global memory TRANSPOSE_BOTH // transpose at both the directions copying } TransposeType; typedef struct TestDesc { cl_uint widthA; cl_uint heightA; cl_uint widthB; cl_uint heightB; cl_uint srowA; // start row in matrix A cl_uint scolA; cl_uint srowB; cl_uint scolB; SubproblemDim dim; PGranularity pgran; bool transpose; bool generic; bool packedImages; TransposeType transpType; // type size DataType type; } TestDesc; typedef struct FuncTable { // fill matrix element with random value void (*fillRandom)(void *a); // fill the matrix element with a special marker void (*fillMarker)(void *a); // function comparing two elements int (*compare)(const void *a, const void *b); // multiply an element 'a' on element 'b' and update the element 'a' void (*mul)(void *a, const void *b); } FuncTable; typedef int (*TestFn)( struct KgenContext *ctx, void *srcBuf, TestDesc *tdesc, cl_device_id devID, cl_context clCtx, cl_command_queue queue); extern char *optarg; const float boundMarker = 5.0; const char *usage = "Usage: t_dblock_kgen -f [-c] [-t type] -d [-n] [-o] [-g];\n" "-c -- launch the CL code on CPU\n" "-t -- transposed version: if option argument is 'local', transpose at copying\n" " to the local memory, if it is 'global', then transpose at copying to the\n" " global memory, if 'both' transpose at both the copying\n" "-d -- data type: float, double, complex_float, complex_double\n" "-n -- matrix width is not float4 aligned\n" "-o -- start offset is not zero\n" "-g -- generic (slow) version\n" "-b -- several rows can be packed to one image row;\n"; const char *rwBlockKernelDecl = "__kernel void\n" "rwMatrBlockTest(\n" " __global %s *matrA,\n" " unsigned int lda,\n" " __global %s *matrB,\n" " unsigned int ldb,\n" " unsigned int srowA,\n" " unsigned int scolA,\n" " unsigned int srowB,\n" " unsigned int scolB)\n"; const char *rwBlockKernelImgDecl = "__kernel void\n" "rwMatrBlockTest(\n" " __global %s *matrA,\n" " unsigned int lda,\n" " __global %s *matrB,\n" " unsigned int ldb,\n" " unsigned int srowA,\n" " unsigned int scolA,\n" " unsigned int srowB,\n" " unsigned int scolB,\n" " __write_only image2d_t image1,\n" " __write_only image2d_t image2)\n"; // type specific functions // for the float type static void fFillRandom(void *a) { *(cl_float*)a = random() % 1000; } static void fFillMarker(void *a) { *(cl_float*)a = boundMarker; } static int fCompare(const void *a, const void *b) { cl_float *a1 = (cl_float*)a; cl_float *b1 = (cl_float*)b; return !(*a1 == *b1); } static void fmul(void *a, const void *b) { cl_float *a1 = (cl_float*)a; cl_float *b1 = (cl_float*)b; *a1 *= *b1; } // for the double type static void dFillRandom(void *a) { *(cl_double*)a = random() % 1000; } static void dFillMarker(void *a) { *(cl_double*)a = boundMarker; } static int dCompare(const void *a, const void *b) { cl_double *a1 = (cl_double*)a; cl_double *b1 = 
(cl_double*)b; return !(*a1 == *b1); } static void dmul(void *a, const void *b) { cl_double *a1 = (cl_double*)a; cl_double *b1 = (cl_double*)b; *a1 *= *b1; } // for the complex float type static void cFillRandom(void *a) { cl_float2 *a1 = (cl_float2*)a; a1->s[0] = random() % 1000; a1->s[1] = random() % 1000; } static void cFillMarker(void *a) { cl_float2 *a1 = (cl_float2*)a; a1->s[0] = boundMarker; a1->s[1] = boundMarker; } static int cCompare(const void *a, const void *b) { cl_float2 *a1 = (cl_float2*)a; cl_float2 *b1 = (cl_float2*)b; return !((a1->s[0] == b1->s[0]) && (a1->s[1] == b1->s[1])); } static void cmul(void *a, const void *b) { MUL_COMPLEX(a, b, cl_float2); } // for the complex double type void zFillRandom(void *a) { cl_double2 *a1 = (cl_double2*)a; a1->s[0] = random() % 1000; a1->s[1] = random() % 1000; } void zFillMarker(void *a) { cl_double2 *a1 = (cl_double2*)a; a1->s[0] = boundMarker; a1->s[1] = boundMarker; } int zCompare(const void *a, const void *b) { cl_double2 *a1 = (cl_double2*)a; cl_double2 *b1 = (cl_double2*)b; return !((a1->s[0] == b1->s[0]) && (a1->s[1] == b1->s[1])); } static void zmul(void *a, const void *b) { MUL_COMPLEX(a, b, cl_double2); } static FuncTable funcTable[TYPE_COMPLEX_DOUBLE + 1] = { {fFillRandom, fFillMarker, fCompare, fmul}, {dFillRandom, dFillMarker, dCompare, dmul}, {cFillRandom, cFillMarker, cCompare, cmul}, {zFillRandom, zFillMarker, zCompare, zmul} }; /* * fill matrix with random elements or the special random * element if 'random' is set to true */ static void fillMatrix( cl_float *matr, size_t height, size_t width, size_t ld, DataType dtype, bool marker) { unsigned int nfloats; size_t i, j; void *p; void (*fill)(void*); fill = (marker) ? funcTable[dtype].fillMarker : funcTable[dtype].fillRandom; nfloats = dtypeSize(dtype) / sizeof(cl_float); for (i = 0; i < height; i++) { for (j = 0; j < width; j++) { p = (cl_float*)matr + (i * ld + j) * nfloats; fill(p); } } } static int compareMatrices(void *matrA, void *matrB, const TestDesc *tdesc) { size_t i, j; unsigned int nfloats; void *p1, *p2; int ret = 0; double a1, b1, a2, b2; nfloats = dtypeSize(tdesc->type) / sizeof(cl_float); for (i = 0; (i < tdesc->dim.y) && !ret; i++) { for (j = 0; j < tdesc->dim.x; j++) { p1 = (cl_float*)matrA + ((tdesc->srowA + i) * tdesc->widthA + tdesc->scolA + j) * nfloats; if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_BOTH)) { p2 = (cl_float*)matrB + ((tdesc->srowB + j) * tdesc->widthB + tdesc->scolB + i) * nfloats; } else { p2 = (cl_float*)matrB + ((tdesc->srowB + i) * tdesc->widthB + tdesc->scolB + j) * nfloats; } ret = funcTable[tdesc->type].compare(p1, p2); if (ret) { printf("The first error occurred at row %lu, column %lu " "of the block: ", i + tdesc->srowA, j + tdesc->scolA); if ((tdesc->type == TYPE_FLOAT) || (tdesc->type == TYPE_DOUBLE)) { if (tdesc->type == TYPE_FLOAT) { a1 = *(cl_float*)p1; b1 = *(cl_float*)p2; } else { a1 = *(cl_double*)p1; b1 = *(cl_double*)p2; } printf("value is %.5E but must be %.5E\n", b1, a1); } else { if (tdesc->type == TYPE_COMPLEX_FLOAT) { EXTRACT_COMPLEX_DOUBLE(p1, cl_float2, a1, a2); EXTRACT_COMPLEX_DOUBLE(p2, cl_float2, b1, b2); } else { EXTRACT_COMPLEX_DOUBLE(p1, cl_double2, a1, a2); EXTRACT_COMPLEX_DOUBLE(p2, cl_double2, b1, b2); } printf("value is (%.5E, %.5E) but must be (%.5E, %.5E)\n", b1, b2, a1, a2); } break; } } } return ret; } static int checkBound( void *matr, DataType dtype, size_t srow, size_t scol, size_t nrRows, size_t nrCols, size_t rwidth) { size_t i, j; unsigned int nfloats; void *p; int ret = 0; 
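/*
 * checkBound() verifies that the given rectangle of the matrix still holds
 * the bound-marker value written by fillMarker(), i.e. that the kernel did
 * not write outside the block it was asked to copy.
 */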
double a1, a2; unsigned char marker[sizeof(cl_double2)]; nfloats = dtypeSize(dtype) / sizeof(cl_float); funcTable[dtype].fillMarker(marker); for (i = 0; (i < nrRows) && !ret; i++) { for (j = 0; j < nrCols; j++) { p = (cl_float*)matr + ((srow + i) * rwidth + scol + j) * nfloats; ret = funcTable[dtype].compare(p, marker); if (ret) { printf("The bound marker first damaged at row %lu, column %lu " "of the block: ", i + srow, j + scol); if ((dtype == TYPE_FLOAT) || (dtype == TYPE_DOUBLE)) { if (dtype == TYPE_FLOAT) { a1 = *(cl_float*)p; } else { a1 = *(cl_double*)p; } printf("actual value is %.5E\n", a1); } else { if (dtype == TYPE_COMPLEX_FLOAT) { EXTRACT_COMPLEX_DOUBLE(p, cl_float2, a1, a2); } else { EXTRACT_COMPLEX_DOUBLE(p, cl_double2, a1, a2); } printf("actual value is (%.5E, %.5E)\n", a1, a2); } break; } } } return ret; } // check the data was not written outside bound static int checkMatrixBound(void *matrB, const TestDesc *tdesc) { int ret = 0; size_t dimr, dimc; if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_BOTH)) { dimr = tdesc->dim.y; dimc = tdesc->dim.x; } else { dimr = tdesc->dim.x; dimc = tdesc->dim.y; } if (tdesc->srowB) { ret = checkBound(matrB, tdesc->type, 0, 0, tdesc->srowB, tdesc->widthB, tdesc->widthB); } if (tdesc->scolB && !ret) { ret = checkBound(matrB, tdesc->type, tdesc->srowB, 0, dimc, tdesc->scolB, tdesc->widthB); } if ((tdesc->scolB + dimr < tdesc->widthB) && !ret) { ret = checkBound(matrB, tdesc->type, tdesc->srowB, tdesc->scolB + dimr, dimc, tdesc->widthB - tdesc->scolB - dimr, tdesc->widthB); } if ((tdesc->srowB + dimc < tdesc->heightB) && !ret) { ret = checkBound(matrB, tdesc->type, tdesc->srowB + dimc, 0, tdesc->heightB - tdesc->srowB - dimc, tdesc->widthB, tdesc->widthB); } return ret; } // Check the data was not written outside bound. Several matrix rows can be // packed into single image line. static int checkImageBound(void *imgB, const TestDesc *tdesc) { int ret = 0; // Size of packed line of rows, in tdesc->type's size_t pLine; size_t rowsInLine; rowsInLine = (tdesc->widthB / tdesc->dim.x); pLine = rowsInLine * tdesc->dim.x; //right ret = checkBound(imgB, tdesc->type, 0, pLine, tdesc->heightB, tdesc->widthB - pLine, tdesc->widthB); //last image line tail if (!ret && ((tdesc->dim.x * tdesc->dim.y) % pLine != 0)) { ret = checkBound(imgB, tdesc->type, (tdesc->dim.x * tdesc->dim.y) / pLine, (tdesc->dim.x * tdesc->dim.y) % pLine, 1, (pLine - (tdesc->dim.x * tdesc->dim.y) % pLine) % pLine, tdesc->widthB); } //bottom if (!ret) { int startRow = tdesc->dim.x * tdesc->dim.y / pLine; if (tdesc->dim.x * tdesc->dim.y % pLine != 0) { startRow ++; } ret = checkBound(imgB, tdesc->type, startRow, 0, tdesc->heightB - startRow, tdesc->widthB, tdesc->widthB); } return ret; } // Compare image with matrix. Several matrix rows can be packed into single // image line. 
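/*
 * Packing scheme used by compareImage() (a sketch of the indexing, derived
 * from the code below): with pLine = (widthB / dim.x) * dim.x elements per
 * image line, block element (i, j) has linear index i * dim.x + j and is
 * read from image row (index / pLine) at column (index % pLine).
 */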
static int compareImage(void *matrA, void *imgB, const TestDesc *tdesc) { size_t i, j; unsigned int nfloats; void *p1, *p2; int ret = 0; double a1, b1, a2, b2; nfloats = dtypeSize(tdesc->type) / sizeof(cl_float); for (i = 0; (i < tdesc->dim.y) && !ret; i++) { for (j = 0; j < tdesc->dim.x; j++) { // Size of packed line of rows, in tdesc->type's int pLine; // absolute index of element in image int index; p1 = (cl_float*)matrA + ((tdesc->srowA + i) * tdesc->widthA + tdesc->scolA + j) * nfloats; pLine = (tdesc->widthB / tdesc->dim.x) * tdesc->dim.x; index = i * tdesc->dim.x + j; p2 = (cl_float*)imgB + ((index / pLine) * tdesc->widthB + index % pLine) * nfloats; ret = funcTable[tdesc->type].compare(p1, p2); if (ret) { printf("The first error occurred at row %lu, column %lu " "of the block: ", i + tdesc->srowA, j + tdesc->scolA); if ((tdesc->type == TYPE_FLOAT) || (tdesc->type == TYPE_DOUBLE)) { if (tdesc->type == TYPE_FLOAT) { a1 = *(cl_float*)p1; b1 = *(cl_float*)p2; } else { a1 = *(cl_double*)p1; b1 = *(cl_double*)p2; } printf("value is %.5E but must be %.5E\n", b1, a1); } else { if (tdesc->type == TYPE_COMPLEX_FLOAT) { EXTRACT_COMPLEX_DOUBLE(p1, cl_float2, a1, a2); EXTRACT_COMPLEX_DOUBLE(p2, cl_float2, b1, b2); } else { EXTRACT_COMPLEX_DOUBLE(p1, cl_double2, a1, a2); EXTRACT_COMPLEX_DOUBLE(p2, cl_double2, b1, b2); } printf("value is (%.5E, %.5E) but must be (%.5E, %.5E)\n", b1, b2, a1, a2); } break; } } } return ret; } static cl_uint get_cl_device(cl_device_id *id, int type) { cl_uint status; cl_uint numEnt; cl_platform_id platform; status = clGetPlatformIDs(0, NULL, &numEnt); status += clGetPlatformIDs(1, &platform, NULL); status += clGetDeviceIDs(platform, type, 1, id, &numEnt); return status; } // create memory buffer objects needed for a test case static cl_int createBufferObjs( void *matrA, void *matrB, cl_mem *aobj, cl_mem *bobj, cl_context ctx, size_t asize, size_t bsize) { cl_int status; if (aobj != NULL) { *aobj = clCreateBuffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR), asize, matrA, &status); if (*aobj == NULL) { printf("Memory object creation for A matrix failed, status = %d, " "asize = %lu\n", status, asize); return status; } } *bobj = clCreateBuffer(ctx, (CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR), bsize, matrB, &status); if (*bobj == NULL) { printf("Memory object creation for B matrix failed, status = %d, " "bsize = %lu\n", status, bsize); if (aobj) { clReleaseMemObject(*aobj); *aobj = NULL; } } return status; } // create image memory objects needed for a test case static cl_int createImageObjs( void *img1, void *img2, cl_mem *img1obj, cl_mem *img2obj, cl_context ctx, size_t pixels_width, size_t pixels_height) { cl_mem *objs[2] = {img1obj, img2obj}; void *bufs[2] = {img1, img2}; const char *names[2]={"first", "second"}; const cl_image_format format = { CL_RGBA, CL_FLOAT }; cl_int status; int i; for (i=0; i<2; i++) { if (objs[i] == NULL) { continue; } *objs[i] = clCreateImage2D(ctx, (CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR), &format, pixels_width, pixels_height, 0, bufs[i], &status); if (status != CL_SUCCESS) { printf("Memory object creation for %s image failed, status = %d, " "width = %lupx, height = %lupx\n", names[i], status, pixels_width, pixels_height); if (i==1) { //first image was created successfully, release it if(objs[0] != NULL) { clReleaseMemObject(*objs[0]); } } break; } } return status; } // create a kernel needed for a test case static cl_kernel createKernel( const char *kernName, char *srcBuf, cl_context ctx, cl_device_id devID, cl_program *program) { char 
log[65536]; cl_int status; cl_kernel krn = NULL; *program = buildClProgram(srcBuf, NULL, ctx, devID, log, sizeof(log), &status); if (*program == NULL) { printf("Program building failed, status = %d, log info:\n%s\n", status, log); } else { krn = clCreateKernel(*program, kernName, &status); if (krn == NULL) { printf("Kernel creation failed, status = %d\n", status); clReleaseProgram(*program); *program = NULL; printf("failed program code: \"%s\"\n", srcBuf); fflush(stdout); } } return krn; } static void releaseBufferObjs( cl_mem aobj, cl_mem bobj) { if (aobj != NULL) { clReleaseMemObject(aobj); } clReleaseMemObject(bobj); } static int testMatrBlockRW( struct KgenContext *ctx, void *srcBuf, TestDesc *tdesc, cl_device_id devID, cl_context clCtx, cl_command_queue queue) { cl_float *matrA; cl_float *matrB; cl_float *img1; cl_float *img2; TestDesc tdescImage; cl_mem aobj = NULL, bobj = NULL; cl_mem img1obj = NULL, img2obj = NULL; unsigned int tsize; char tmp[1024]; KernelDesc kdesc; const char *s, *s1; int ret; // read, write, global to image, local to image functions names char rname[128], wname[128], giname[128], liname[128]; size_t size, asize, bsize; // width and height in pixels, size in bytes size_t imageWidth, imageHeight, imgSize; cl_program program = NULL; cl_device_type devType; KernelArg *karg; KernelErrorInfo errInfo; cl_event event; cl_int status; SubproblemDim dim, *pdim; // local memory block leading dimension for generic read and write back size_t ld; bool testImages; DBlockCopyFlags flags = 0; unsigned int nfloats; bool b; memset(&kdesc, 0, sizeof(kdesc)); rname[0] = wname[0] = giname[0] = liname[0] = '\0'; clGetDeviceInfo(devID, CL_DEVICE_TYPE, sizeof(devType), &devType, NULL); tsize = dtypeSize(tdesc->type); nfloats = tsize / sizeof(cl_float); if (((tdesc->dim.x * tsize) % sizeof(cl_float4) == 0) && (devType == CL_DEVICE_TYPE_GPU) && !tdesc->transpose) { testImages = true; } else { printf("Size of row is not float4 aligned, or the target device is CPU," "or copying should be transposed, images are not used.\n"); testImages = false; } resetKgenContext(ctx); asize = tdesc->heightA * tdesc->widthA * tsize; bsize = tdesc->heightB * tdesc->widthB * tsize; // Size of images in pixels. Each pixel is float4. 
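/*
 * fl4RowWidth() converts an element count into a number of float4 pixels,
 * rounding up, so imageWidth below is measured in pixels while the matrix
 * dimensions are in elements. In the packedImages case the element count
 * passed is 3.5 * dim.x, which appears intended to let several matrix rows
 * share one image line, leaving a ragged tail on the last line.
 */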
if (tdesc->packedImages) { imageWidth = fl4RowWidth(tdesc->dim.x * 3.5, tsize); imageHeight = tdesc->dim.y; } else { imageWidth = fl4RowWidth(tdesc->dim.x, tsize); imageHeight = tdesc->dim.y; } imgSize = imageHeight * imageWidth * sizeof(cl_float4); matrA = malloc(asize); matrB = malloc(bsize); img1 = malloc(imgSize); img2 = malloc(imgSize); if (!matrA || !matrB || !img1 || !img2) { printf("Memory allocation failed\n"); return -1; } fillMatrix(matrA, tdesc->heightA, tdesc->widthA, tdesc->widthA, tdesc->type, false); fillMatrix(matrB, tdesc->heightB, tdesc->widthB, tdesc->widthB, tdesc->type, true); fillMatrix(img1, imageHeight, imageWidth * FLOAT4_VECLEN / nfloats, imageWidth * FLOAT4_VECLEN / nfloats, tdesc->type, true); fillMatrix(img2, imageHeight, imageWidth * FLOAT4_VECLEN / nfloats, imageWidth * FLOAT4_VECLEN / nfloats, tdesc->type, true); if (createBufferObjs(matrA, matrB, &aobj, &bobj, clCtx, asize, bsize) != CL_SUCCESS) { return -1; } if (testImages) { // function gets width in float4's if (createImageObjs(img1, img2, &img1obj, &img2obj, clCtx, imageWidth, imageHeight) != CL_SUCCESS) { releaseBufferObjs(aobj, bobj); return -1; } } b = isDoubleBasedType(tdesc->type); kgenDeclareUptrs(ctx, b); kgenAddBlankLine(ctx); s = dtypeBuiltinType(tdesc->type); s1 = dtypeUPtrField(tdesc->type); pdim = (tdesc->generic) ? NULL : &dim; // generate the functions dim = tdesc->dim; if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_GLOBAL)) { flags = DBLOCK_COPY_TRANSPOSE; } if ((devType == CL_DEVICE_TYPE_CPU) && (tdesc->widthA % sizeof(cl_float4) || tdesc->srowA)) { flags |= DBLOCK_COPY_NOT_VECTORIZE; } copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type, DBLOCK_GLOBAL_TO_LOCAL, flags); kgenGetLastFuncName(rname, sizeof(rname), ctx); kgenAddBlankLine(ctx); if (tdesc->transpose && (tdesc->transpType != TRANSPOSE_GLOBAL)) { ld = fl4RowWidth(tdesc->dim.y, tsize) * FLOAT4_VECLEN / nfloats; } else { ld = fl4RowWidth(tdesc->dim.x, tsize) * FLOAT4_VECLEN / nfloats; } if (tdesc->transpose) { flags = (tdesc->transpType == TRANSPOSE_LOCAL) ? 
0 : DBLOCK_COPY_TRANSPOSE; if (tdesc->transpType != TRANSPOSE_GLOBAL) { dim.x = tdesc->dim.y; dim.y = tdesc->dim.x; } } else { flags = 0; } if ((devType == CL_DEVICE_TYPE_CPU) && (tdesc->widthA % sizeof(cl_float4) || tdesc->srowA)) { flags |= DBLOCK_COPY_NOT_VECTORIZE; } copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type, DBLOCK_LOCAL_TO_GLOBAL, flags); kgenGetLastFuncName(wname, sizeof(wname), ctx); kgenAddBlankLine(ctx); if (testImages) { if (tdesc->packedImages) { flags |= DBLOCK_COPY_PACKED_IMAGE; } copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type, DBLOCK_GLOBAL_TO_IMAGE, flags); kgenGetLastFuncName(giname, sizeof(giname), ctx); kgenAddBlankLine(ctx); copyDataBlockGen(ctx, pdim, &tdesc->pgran, tdesc->type, DBLOCK_LOCAL_TO_IMAGE, flags); kgenGetLastFuncName(liname, sizeof(liname), ctx); kgenAddBlankLine(ctx); } if (testImages) { sprintf(tmp, rwBlockKernelImgDecl, s, s); } else { sprintf(tmp, rwBlockKernelDecl, s, s); } kgenDeclareFunction(ctx, tmp); kgenBeginFuncBody(ctx); size = fl4RowWidth(tdesc->dim.x, tsize) * tdesc->dim.y * FLOAT4_VECLEN; if (size < fl4RowWidth(tdesc->dim.y, tsize) * tdesc->dim.x * FLOAT4_VECLEN) { size = fl4RowWidth(tdesc->dim.y, tsize) * tdesc->dim.x * FLOAT4_VECLEN; } // declare and initialize local variables sprintf(tmp, "__local float tmpBuf[%lu];\n" "LPtr tmp;\n" "GPtr src, dst;\n" "\n" "tmp.f = tmpBuf;\n" "src.%s = matrA;\n" "dst.%s = matrB;\n\n", size, s1, s1); kgenAddStmt(ctx, tmp); // read block call if (tdesc->generic) { sprintf(tmp, "%s(tmp, src, srowA, scolA, %lu, %lu, %lu, lda);\n", rname, tdesc->dim.y, tdesc->dim.x, ld); } else { sprintf(tmp, "%s(tmp, src, srowA, scolA, lda);\n", rname); } kgenAddStmt(ctx, tmp); kgenAddStmt(ctx, "barrier(CLK_LOCAL_MEM_FENCE);\n"); // write block call if (tdesc->generic) { sprintf(tmp, "%s(dst, tmp, srowB, scolB, %lu, %lu, ldb, %lu);\n", wname, dim.y, dim.x, ld); } else { sprintf(tmp, "%s(dst, tmp, srowB, scolB, ldb);\n", wname); } kgenAddStmt(ctx, tmp); if (testImages) { // global memory to image write function call if (tdesc->generic) { sprintf(tmp, "%s(image1, 0, 0, src, srowA, scolA, %lu, %lu, lda);\n", giname, dim.y, dim.x); } else { sprintf(tmp, "%s(image1, 0, 0, src, srowA, scolA, lda);\n", giname); } kgenAddStmt(ctx, tmp); // local memory to image write function call if (tdesc->generic) { sprintf(tmp, "%s(image2, 0, 0, tmp, %lu, %lu, %lu);\n", liname, dim.y, dim.x, ld); } else { sprintf(tmp, "%s(image2, 0, 0, tmp);\n", liname); } kgenAddStmt(ctx, tmp); } ret = kgenEndFuncBody(ctx); // now compile and launch the kernel if (!ret) { kdesc.kernel = createKernel("rwMatrBlockTest", srcBuf, clCtx, devID, &program); if (kdesc.kernel == NULL) { ret = -1; } } karg = kdesc.args; initMemobjKarg(&karg[0], aobj, matrA, asize, MEMOBJ_WRITE); INIT_KARG(&karg[1], tdesc->widthA); initMemobjKarg(&karg[2], bobj, matrB, bsize, MEMOBJ_READ); INIT_KARG(&karg[3], tdesc->widthB); INIT_KARG(&karg[4], tdesc->srowA); INIT_KARG(&karg[5], tdesc->scolA); INIT_KARG(&karg[6], tdesc->srowB); INIT_KARG(&karg[7], tdesc->scolB); if (testImages) { INIT_KARG(&karg[8], img1obj); INIT_KARG(&karg[9], img2obj); } kdesc.globalThreads[0] = tdesc->pgran.wgSize[0]; kdesc.localThreads[0] = tdesc->pgran.wgSize[0]; kdesc.workDim = 1; kdesc.needExecTime = 1; kdesc.event = &event; if (!ret) { status = launchClKernel(&kdesc, queue, &errInfo); if (status != CL_SUCCESS) { printf("Kernel launching failed: status = %d, phase = %d, " "wrong arg = %d\n", status, errInfo.phase, errInfo.wrongArg); ret = -1; } } if (testImages) { if (!ret) { ret = 
clEnqueueReadImage(queue, img1obj, CL_TRUE, (size_t[3]){0, 0, 0}, (size_t[3]){imageWidth, imageHeight, 1}, 0, 0, img1, 0, NULL, NULL); if (ret) { printf ("image read failed, code %d\n", ret); } } if (!ret) { ret = clEnqueueReadImage(queue, img2obj, CL_TRUE, (size_t[3]){0, 0, 0}, (size_t[3]){imageWidth, imageHeight, 1}, 0, 0, img2, 0, NULL, NULL); if (ret) { printf ("image read failed, code %d\n", ret); } } } memcpy(&tdescImage, tdesc, sizeof(tdescImage)); // width in tdesc->types tdescImage.widthB = (imageWidth * FLOAT4_VECLEN) / nfloats; tdescImage.heightB = imageHeight; tdescImage.scolB = 0; tdescImage.srowB = 0; // check the result if (!ret) { ret = compareMatrices(matrA, matrB, tdesc); // check the data wasn't written outside the square if (!ret) { ret = checkMatrixBound(matrB, tdesc); } } if (testImages) { if (tdesc->packedImages) { // compare matrix with packed image data if (!ret) { ret = compareImage(matrA, img1, &tdescImage); if (!ret) { ret = checkImageBound(img1, &tdescImage); } } if (!ret) { ret = compareImage(matrA, img2, &tdescImage); if (!ret) { ret = checkImageBound(img2, &tdescImage); } } } else { if (!ret) { ret = compareMatrices(matrA, img1, &tdescImage); if (!ret) { ret = checkMatrixBound(img1, &tdescImage); } } if (!ret) { ret = compareMatrices(matrA, img2, &tdescImage); if (!ret) { ret = checkMatrixBound(img2, &tdescImage); } } } } releaseBufferObjs(aobj, bobj); if (testImages) { releaseBufferObjs(img1obj, img2obj); } if (kdesc.kernel) { clReleaseKernel(kdesc.kernel); clReleaseProgram(program); } free(matrA); free(matrB); free(img1); free(img2); return ret; } static int parseDataType(DataType *dtype) { int ret = 0; if (!strcmp(optarg, "float")) { *dtype = TYPE_FLOAT; } else if (!strcmp(optarg, "double")) { *dtype = TYPE_DOUBLE; } else if (!strcmp(optarg, "complex_float")) { *dtype = TYPE_COMPLEX_FLOAT; } else if (!strcmp(optarg, "complex_double")) { *dtype = TYPE_COMPLEX_DOUBLE; } else { printf("An unsupported data typs is specified: %s\n", optarg); ret = -1; } return ret; } static int parseTransposeType(TransposeType *ttype) { int ret = 0; if (!strcmp(optarg, "local")) { *ttype = TRANSPOSE_LOCAL; } else if (!strcmp(optarg, "global")) { *ttype = TRANSPOSE_GLOBAL; } else if (!strcmp(optarg, "both")) { *ttype = TRANSPOSE_BOTH; } else { printf("An unsupported transpose type is specified: %s\n", optarg); ret = -1; } return ret; } static int runTestCases( struct KgenContext *ctx, char *srcBuf, TestDesc *tdesc, cl_device_id devID, cl_context clCtx, cl_command_queue queue, TestFn fn) { int i, i1; int ret = 0; unsigned int nfloats; i1 = (tdesc->type == TYPE_COMPLEX_DOUBLE) ? 
1 : 2; nfloats = dtypeSize(tdesc->type) / sizeof(cl_float); tdesc->pgran.wgDim = 1; tdesc->pgran.wgSize[1] = 1; tdesc->pgran.wfSize = 64; for (i = 0; i < i1; i++) { if (!i) { printf("Tests with float4 aligned rows:\n\n"); tdesc->dim.x = 64; } else { printf("Tests with not float4 aligned rows:\n\n"); tdesc->dim.x = 65; } printf("Number of block rows is equal to the work group size\n"); tdesc->dim.y = 64 / nfloats; tdesc->pgran.wgSize[0] = 64 / nfloats; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } printf("PASS\n\n"); printf("Number of block rows is greater than the work group size, " "the rows number is divided on the work group size\n"); tdesc->pgran.wgSize[0] = 32 / nfloats; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } tdesc->pgran.wgSize[0] = 64 / nfloats; printf("PASS\n\n"); printf("Number of block rows is greater than the work group size, " "the rows number is not divided on the work group size\n"); tdesc->dim.y = 99 / nfloats; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } printf("PASS\n\n"); printf("Number of block rows is less than the work group size\n" "The work group size is divided on the number of rows\n"); tdesc->dim.y = 32 / nfloats; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } printf("PASS\n\n"); printf("Number of block rows is less than the work group size\n" "The work group size is not divided on the number of rows\n"); tdesc->dim.y = (17 + nfloats - 1) / nfloats; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } printf("PASS\n\n"); printf("Number of block rows is less than the work group size\n" "The work group size is not divided on the number of rows\n" "Each row consists of 1 elements\n"); tdesc->dim.x = 1; ret = fn(ctx, srcBuf, tdesc, devID, clCtx, queue); if (ret) { printf("FAIL\n\n"); break; } printf("PASS\n\n"); } return ret; } int main(int argc, char *argv[]) { struct KgenContext *ctx; char *buf; TestDesc tdesc; cl_context clCtx = NULL; cl_command_queue queue = NULL; cl_device_id devID; int devType = CL_DEVICE_TYPE_GPU; cl_int status; int err = 0; int opt; TestFn func; // test with non zero offset bool off = false; // test with non float4 aligned width bool v4na = false; char dataType[64]; const char *s2 = "", *s3 = "", *s4 = "", *s5 = "", *s7 = ""; const char *s6 = "GPU"; memset(&tdesc, 0, sizeof(tdesc)); tdesc.transpose = false; tdesc.type = -1; // parse command line arguments while (!err) { opt = getopt(argc, argv, "ct:d:nogb"); if (opt == -1) { break; } switch (opt) { case 'c': devType = CL_DEVICE_TYPE_CPU; s5 = "CPU"; break; case 't': tdesc.transpose = true; err = parseTransposeType(&tdesc.transpType); break; case 'd': err = parseDataType(&tdesc.type); if (!err) { sprintf(dataType, "%s", optarg); } break; case 'g': tdesc.generic = true; s5 = ", generic (slow) version"; break; case 'n': v4na = true; break; case 'o': off = true; break; case 'b': tdesc.packedImages = true; s7 = ", several rows can be packed to one image row"; break; default: printf("Wrong option %c\n", opt); err = 1; break; } } if ((signed)tdesc.type == -1) { printf("Data type is not specified\n"); err = -1; } if (err) { printf("%s", usage); return 1; } status = get_cl_device(&devID, devType); if (status) { printf("Device opening failed, status = %d\n", status); return 1; } clCtx = clCreateContext((const cl_context_properties*)NULL, 1, &devID, NULL, NULL, &status); if (clCtx == 
NULL) { printf("Context creation failed, status = %d\n", status); } if (clCtx != NULL) { queue = clCreateCommandQueue(clCtx, devID, CL_QUEUE_PROFILING_ENABLE, &status); if (queue == NULL) { clReleaseContext(clCtx); printf("Command queue creation failed, status = %d\n", status); } } buf = malloc(SOURCE_BUFLEN); ctx = createKgenContext(buf, SOURCE_BUFLEN, true); func = testMatrBlockRW; if (v4na) { tdesc.widthA = 2055; tdesc.widthB = 2777; s2 = ", matrix rows are not aligned to float4 boundary"; } else { tdesc.widthA = 2048; tdesc.widthB = 2560; s2 = "matrix rows are aligned to float4 boundary"; } tdesc.heightA = 2048; tdesc.heightB = 2048; if (off) { s3 = ", starting offsets are not zero"; tdesc.srowA = 17; tdesc.scolA = 27; tdesc.srowB = 55; tdesc.scolB = 86; } else { s3 = ", starting offsets are zero"; } if (tdesc.transpose) { switch (tdesc.transpType) { case TRANSPOSE_LOCAL: s4 = ", transpose at reading"; break; case TRANSPOSE_GLOBAL: s4 = ", transpose at writing back"; break; case TRANSPOSE_BOTH: s4 = ", transpose at both reading and writing back"; break; } } printf("Test read/write block function with %s data type%s%s%s%s%s.\n" "Run the test on %s...\n\n", dataType, s2, s3, s4, s5, s7, s6); if (runTestCases(ctx, buf, &tdesc, devID, clCtx, queue, func)) { printf("Source: \n%s\n", buf); } // release OpenCL objects clReleaseCommandQueue(queue); clReleaseContext(clCtx); return 0; } clblas-2.10/src/library/common/tests/t_gens_cache.c000066400000000000000000000251641264277366700223630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * test generator and cache infrastructure */ #include #include #include #ifdef __APPLE__ #include #else #include #endif #include #include enum { NR_TEST_PATTERNS = 5, KERNELS_PER_PATTERN = 10, KCACHE_SIZE_LIMIT = 1048576 }; const char *strcpyImpl = "char\n" "*strcpy(char *dst, char *src)\n" "{\n" " do {\n" " *dst++ = *src++;\n" " } while (*(dst - 1) != 0);\n" "}"; static int testGenFunc(struct KgenContext *ctx) { kgenDeclareFunction(ctx, "char\n" "*strcpy(char *dst, char *src)\n"); kgenBeginFuncBody(ctx); kgenAddStmt(ctx, "char *ret = dst;\n\n"); kgenBeginBranch(ctx, "do"); kgenAddStmt(ctx, "*dst = *src;\n" "src++;\n" "dst++;\n"); kgenEndBranch(ctx, "while (*(dst - 1) != 0)"); kgenAddBlankLine(ctx); kgenAddStmt(ctx, "return ret;\n"); return kgenEndFuncBody(ctx); } static int kernExtraCmp(const void *extra, const void *extraKey) { unsigned long u1 = *(unsigned long*)extra; unsigned long u2 = *(unsigned long*)extraKey; return !(u1 == u2); } static int testGen(void) { char buf[4096]; char name[64]; int r; struct KgenContext *ctx; size_t s; ctx = createKgenContext(buf, sizeof(buf), true); if (ctx == NULL) { printf("Context creation failed\n"); printf("FAIL\n\n"); return -1; } printf("Test normal kernel generation\n"); if (!testGenFunc(ctx)) { printf("Generated code:\n\n"); printf("%s", buf); printf("\n\nPASS\n\n"); } else { printf("FAIL\n\n"); } printf("Test function name extracting from the generated code\n"); r = kgenGetLastFuncName(name, sizeof(name), ctx); if (r) { printf("FAIL\n"); } else { if (strcmp((const char*)name, "strcpy")) { printf("Extracted names is %s must be strcpy\n", name); printf("FAIL\n\n"); r = -1; } else { printf("PASS\n\n"); } } destroyKgenContext(ctx); printf("Test source size calculating without actual source " "adding to any buffer\n"); ctx = createKgenContext(NULL, 0, true); r = kgenAddStmt(ctx, strcpyImpl); if (!r) { s = kgenSourceSize(ctx); if (s != strlen(strcpyImpl)) { r = -1; } } if (r) { printf("FAIL\n\n"); } else { printf("PASS\n\n"); } destroyKgenContext(ctx); ctx = createKgenContext(buf, 5, true); if (!r) { printf("Test generation with insufficient buffer\n"); if (testGenFunc(ctx)) { printf("PASS\n"); } else { printf("FAIL\n"); r = -1; } } return r; } // test case for kache error functionality static int errorCacheTestCase( const char *msg, struct KernelCache *kcache, solver_id_t sid, SubproblemDim *dims, unsigned int nrDims, cl_context context, cl_device_id device, unsigned long extra, Kernel *kern) { KernelKey key; Kernel* krn1; int r; bool fail; key.device = device; key.context = context; key.nrDims = nrDims; memset(key.subdims, 0, sizeof(key.subdims)); r = nrDims; if (nrDims > MAX_SUBDIMS) r = MAX_SUBDIMS; memcpy(key.subdims, dims, sizeof(SubproblemDim) * r); printf("%s", msg); if (kern == NULL) { krn1 = findKernel(kcache, sid, &key, &extra); fail = (krn1 != NULL); } else { r = addKernelToCache(kcache, sid, kern, &key, kernExtraCmp); fail = (r == 0); } if (fail) { printf("FAIL\n"); r = -1; } else { printf("PASS\n"); r = 0; } return r; } static int testCache(cl_context context, cl_device_id device) { int r = 0; int i, j; unsigned int k; const solver_id_t wrongSID = 15; struct KernelCache *kcache; KernelKey key; Kernel *kern[NR_TEST_PATTERNS][KERNELS_PER_PATTERN], *krn1; SubproblemDim dims[NR_TEST_PATTERNS][KERNELS_PER_PATTERN][MAX_SUBDIMS]; unsigned int nrDims[NR_TEST_PATTERNS] = {1, 3, 2, 2, 1}; unsigned long extra = 7, extra1; printf("Testing inserting and normal searching 
of kernels\n"); kcache = createKernelCache(10, KCACHE_SIZE_LIMIT); key.device = device; key.context = context; for (i = 0; (i < NR_TEST_PATTERNS) && !r; i++) { for (j = 0; (j < KERNELS_PER_PATTERN) && !r; j++) { for (k = 0; k < nrDims[i]; k++) { dims[i][j][k].x = random() % 1000; if (k == 2) { dims[i][j][k].y = SUBDIM_UNUSED; dims[i][j][k].itemX = SUBDIM_UNUSED; } else { dims[i][j][k].y = random() % 1000; dims[i][j][k].itemX = random() % 1000; } dims[i][j][k].bwidth = random() % 1000; dims[i][j][k].itemY = random() % 1000; } kern[i][j] = allocKernel(); kern[i][j]->extra = &extra; kern[i][j]->extraSize = sizeof(extra); key.nrDims = nrDims[i]; memset(key.subdims, 0, sizeof(key.subdims)); memcpy(key.subdims, dims[i][j], sizeof(SubproblemDim) * key.nrDims); r = addKernelToCache(kcache, i, kern[i][j], &key, kernExtraCmp); } } if (r) { printf("Error at addition to the cache, i = %d, j = %d\n", i, j); printf("FAIL\n"); } else { // Now try to find each cached kernel extra1 = extra; for (i = 0; (i < NR_TEST_PATTERNS) && !r; i++) { for (j = 0; j < KERNELS_PER_PATTERN; j++) { key.nrDims = nrDims[i]; memset(key.subdims, 0, sizeof(key.subdims)); memcpy(key.subdims, dims[i][j], sizeof(SubproblemDim) * key.nrDims); krn1 = findKernel(kcache, i, &key, &extra1); if (krn1 != kern[i][j]) { r = -1; break; } } } if (r) { printf("First error occurred at pattern %d, kernel %d: ", i, j); if (krn1 == NULL) { printf("the kernel is not found\n"); } else { printf("the kernel mismatch\n"); } } else { printf("PASS\n"); } } // cases for search error functionality dims[0][0][0].x = 1001; if (!r) { r = errorCacheTestCase("Try to search a kernel not being in " "the cache\n", kcache, 0, dims[0][0], nrDims[0], context, device, extra, NULL); } if (!r) { r = errorCacheTestCase("Try To search a kernel with a wrong extra " "information\n", kcache, 0, dims[0][1], nrDims[0], context, device, extra - 2, NULL); } if (!r) { r = errorCacheTestCase("Try to search a kernel with a solver " "ID\n", kcache, wrongSID, dims[0][1], nrDims[0], context, device, extra, NULL); } if (!r) { r = errorCacheTestCase("Try to search a kernel with a wrong number " "of subproblem dimensions\n", kcache, 0, dims[0][1], 500, context, device, extra, NULL); } if (!r) { r = errorCacheTestCase("Try to search a kernel with bad OpenCL context\n", kcache, 0, dims[0][1], 500, (cl_context)-1, device, extra, NULL); } if (!r) { r = errorCacheTestCase("Try to search a kernel with bad OpenCL device\n", kcache, 0, dims[0][1], 500, context, (cl_device_id)-1, extra, NULL); } // error test cases for inserting to cache krn1 = allocKernel(); krn1->extra = &extra; krn1->extraSize = sizeof(extra); if (!r) { r = errorCacheTestCase("Try to insert a kernel with a wrong solver " "ID\n", kcache, wrongSID, dims[0][0], nrDims[0], context, device, extra, krn1); } if (!r) { r = errorCacheTestCase("Try to insert a kernel with a wrong number " "of subproblem dimensions\n", kcache, 0, dims[0][0], 500, context, device, extra, krn1); } return r; } int main(void) { cl_int err; cl_platform_id platform; cl_device_id device; cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context context; err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { fprintf(stderr, "clGetPlatformIDs() failed with %d\n", err); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { fprintf(stderr, "clGetDeviceIDs() failed with %d\n", err); return 1; } props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, 
NULL, NULL, &err); if (err != CL_SUCCESS) { fprintf(stderr, "clCreateContext() failed with %d\n", err); return 1; } printf("Launch tests for kernel generators\n"); printf("-----------------------------------------\n"); if (!testGen()) { printf("-----------------------------------------\n\n"); printf("Launch tests for kernel cache\n"); printf("-----------------------------------------\n"); testCache(context, device); } clReleaseContext(context); return 0; } clblas-2.10/src/library/common/trace_malloc.c000066400000000000000000000132451264277366700212410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #if defined(TRACE_MALLOC) #if _MSC_VER #include #endif // use standard malloc/free though #undef malloc #undef calloc #undef realloc #undef free enum { MTRACE_NODE_MAGIC = 0x5A20286D }; #define MTRACE_LOCK() mutexLock(mutex) #define MTRACE_UNLOCK() mutexUnlock(mutex) #define KIB 1024 #define MIB KIB*1024 typedef struct MtraceNode { unsigned long magic; char *file; int line; void *ptr; size_t size; ListNode node; } MtraceNode; static mutex_t *mutex; static size_t tracedSize; static size_t rawSize; ListHead traceList; static int cmpNode(const ListNode *node, const void *key) { const MtraceNode *mtnode = container_of(node, node, MtraceNode); return !(mtnode->ptr == key); } static __inline size_t rawTracedSize(MtraceNode *mtnode) { return mtnode->size + sizeof(MtraceNode) + strlen(mtnode->file) + 1; } static MtraceNode *searchMtraceNode(void *ptr) { ListNode *node; MTRACE_LOCK(); node = listNodeSearch(&traceList, ptr, cmpNode); MTRACE_UNLOCK(); return (node) ? 
container_of(node, node, MtraceNode) : NULL; } static void freeNode(ListNode *node) { MtraceNode *mtnode = container_of(node, node, MtraceNode); if (mtnode->file != NULL) { free(mtnode->file); } if (mtnode->ptr != NULL) { free(mtnode->ptr); } free(mtnode); } static void sprintfTracedSize(char *str, size_t size) { const char *suffix; if (size < KIB * 10) { suffix = "bytes"; } else if (size < MIB * 10) { suffix = "KiB"; size /= KIB; } else { suffix = "MIB"; size /= MIB; } sprintf(str, "%lu %s", size, suffix); } static void printNodeInfo(ListNode *node) { MtraceNode *mtnode = container_of(node, node, MtraceNode); char s[1024]; sprintfTracedSize(s, mtnode->size); printf("%s at %s line %d\n", s, mtnode->file, mtnode->line); } void initMallocTrace(void) { listInitHead(&traceList); tracedSize = rawSize = 0; mutex = mutexInit(); } void *debugMalloc(size_t size, const char *file, int line) { void *ret = NULL; MtraceNode *mtnode; mtnode = calloc(1, sizeof(MtraceNode)); if (mtnode == NULL) { return NULL; } mtnode->magic = MTRACE_NODE_MAGIC; mtnode->file = strdup(file); if (mtnode->file != NULL) { ret = mtnode->ptr = malloc(size); } if (ret != NULL) { mtnode->line = line; mtnode->size = size; MTRACE_LOCK(); tracedSize += size; rawSize += rawTracedSize(mtnode); listAddToTail(&traceList, &mtnode->node); MTRACE_UNLOCK(); } else { freeNode(&mtnode->node); } return ret; } void *debugCalloc(size_t size, const char *file, int line) { void *ret; ret = debugMalloc(size, file, line); if (ret != NULL) { memset(ret, 0, size); } return ret; } void *debugRealloc(void *ptr, size_t size, const char *file, int line) { void *ret; if (ptr == NULL) { ret = debugMalloc(size, file, line); } else { MtraceNode *mtnode; mtnode = searchMtraceNode(ptr); assert((mtnode != NULL) && (mtnode->magic == MTRACE_NODE_MAGIC)); ret = realloc(ptr, size); if (ret != NULL) { ssize_t delta = (ssize_t)size - (ssize_t)mtnode->size; mtnode->ptr = ret; mtnode->size = size; MTRACE_LOCK(); tracedSize += delta; rawSize += delta; MTRACE_UNLOCK(); } else { debugFree(ptr); } } return ret; } void debugFree(void *ptr) { MtraceNode *mtnode; if (ptr == NULL) { return; } mtnode = searchMtraceNode(ptr); assert((mtnode != NULL) && (mtnode->magic == MTRACE_NODE_MAGIC)); MTRACE_LOCK(); tracedSize -= mtnode->size; rawSize -= rawTracedSize(mtnode); listDel(&mtnode->node); MTRACE_UNLOCK(); freeNode(&mtnode->node); } void printMallocStatistics(void) { char s[1024]; sprintfTracedSize(s, tracedSize); printf("[MALLOC TRACE] Totally %s is allocated\n", s); } void printMemLeaksInfo(void) { puts("\n"); if (!tracedSize) { puts("[MALLOC TRACE] Hurray! 
There are not memory leaks!"); } else { char s1[1024], s2[1024]; sprintfTracedSize(s1, tracedSize); sprintfTracedSize(s2, rawSize); printf("[MALLOC TRACE] Totally %s is lost, raw traced size is %s\n", s1, s2); puts("Detailed report:\n" "------------------------------------------------------------"); assert(!isListEmpty(&traceList)); listDoForEach(&traceList, printNodeInfo); } } void releaseMallocTrace(void) { listDoForEachSafe(&traceList, freeNode); mutexDestroy(mutex); } #endif /* TRACE_MALLOC */ clblas-2.10/src/library/tools/000077500000000000000000000000001264277366700163135ustar00rootroot00000000000000clblas-2.10/src/library/tools/OCLBinaryGenerator/000077500000000000000000000000001264277366700217445ustar00rootroot00000000000000clblas-2.10/src/library/tools/OCLBinaryGenerator/CMakeLists.txt000066400000000000000000000026141264277366700245070ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## cmake_minimum_required(VERSION 2.6) project(OCLBinaryGenerator C CXX) ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS) ADD_EXECUTABLE(OCLBinaryGenerator OCLBinaryGenerator.cpp) target_link_libraries(OCLBinaryGenerator ${OPENCL_LIBRARIES}) include_directories(${OPENCL_INCLUDE_DIRS}) set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/staging" ) if ( MSVC ) set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_BINARY_DIR}/staging" ) set_target_properties( OCLBinaryGenerator PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_BINARY_DIR}/staging" ) endif( ) clblas-2.10/src/library/tools/OCLBinaryGenerator/OCLBinaryGenerator.cpp000066400000000000000000000254631264277366700261130ustar00rootroot00000000000000#include #include #include #include #include #include #include #include #include #include #include #ifdef __GNUC__ // Linux #include #include #include #include #else // Windows #include #include #include #include //#define stat _stat #endif #include "CL/cl.h" void find_and_replace(std::string& str, const std::string& findStr, const std::string& replaceStr){ size_t pos = 0; while ((pos = str.find(findStr, pos)) != std::string::npos){ str.replace(pos, findStr.length(), replaceStr); pos += replaceStr.length(); } } /****************************************************************************** * Check OpenCL Errors *****************************************************************************/ #define CL_CHECK(STATUS) \ if(STATUS != CL_SUCCESS) { \ printf("OpenCL error %i on line %u\n", STATUS, __LINE__); \ assert(false); \ } /****************************************************************************** * write binary to stream *****************************************************************************/ void writeBinaryToStream(std::ostream & out, char *binary, size_t binarySize) { for (int i = 0; i 
< binarySize; i++) { out << std::setw(4) << (int)binary[i]; if (i < binarySize - 1) { out << ","; } if ((i + 1) % 16 == 0) { out << std::endl; } } out << std::endl; } /****************************************************************************** * Get AMD Platform *****************************************************************************/ cl_int getAMDPlatform(cl_platform_id *platform) { *platform = NULL; cl_int status = CL_SUCCESS; // get num platforms cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { std::cout << "Error: clGetPlatformIDs failed. Error code: " << status << std::endl; return status; } if (numPlatforms > 0) { // Get selected platform cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { std::cout << "Error: clGetPlatformIDs failed. Error code : " << status << std::endl; return status; } // Print all platforms for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (status != CL_SUCCESS) { std::cout << "Error: clGetPlatformInfo failed. Error code : " << status << std::endl; return status; } //std::cout << "Platform " << i << " : " << pbuf << std::endl; } // Get AMD platform for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (status != CL_SUCCESS) { std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl; return status; } *platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } // verify AMD platform char pbuf[100]; status = clGetPlatformInfo(*platform, CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if (status != CL_SUCCESS) { std::cout << "Error: clGetPlatformInfo failed. Error code: " << status << std::endl; return status; } if (strcmp(pbuf, "Advanced Micro Devices, Inc.")) { std::cout << "AMD platform not found" << std::endl; return CL_INVALID_PLATFORM; } } else { std::cout << "No OpenCL platforms found." 
<< std::endl; return CL_INVALID_PLATFORM; } return status; } /****************************************************************************** * get kernel binary from source *****************************************************************************/ cl_int getKernelBinaryFromSource( cl_context context, const char *source, const char *buildOptions, char **binary, size_t *binarySize) { cl_int status = CL_SUCCESS; // create program cl_program program = clCreateProgramWithSource(context, 1, &source, NULL, &status); CL_CHECK(status); cl_uint numDevicesInContext; status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL); CL_CHECK(status); // get devices cl_device_id* devices = new cl_device_id[numDevicesInContext]; clGetContextInfo(context, CL_CONTEXT_DEVICES, numDevicesInContext*sizeof(cl_device_id), devices, NULL); CL_CHECK(status); // choose device 0 cl_device_id device = devices[0]; // build program for device status = clBuildProgram(program, 1, &device, buildOptions, NULL, NULL); // print build failure if (status != CL_SUCCESS) { printf("clBuildProgram Failed\n"); printf("status = %d\n", status); size_t len = 0; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len); char* buildLog = new char[len]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len*sizeof(char), buildLog, 0); printf("\nBuild Log:\n\n"); printf("%s\n", buildLog); printf("\n\nKernel String:\n\n"); printf("%s\n", source); binary[0] = 0; *binarySize = 0; return status; } // get binary from program status = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), binarySize, NULL); binary[0] = new char[*binarySize]; status = clGetProgramInfo(program, CL_PROGRAM_BINARIES, 8 /*?*/, binary, NULL); CL_CHECK(status); return CL_SUCCESS; } int main(int argc, char *argv[]) { /* OCLBinaryGenerator requires at least two inputs 1, path the kernel file 2, file name 3, output directory 4, optional compiler flags 5, [optional] trageted hardware. If this is not supplied OCLBinaryGenerator will generate binary for the first device on system */ if (argc < 4) { printf("not enough arguments. 
OCLBinaryGenerator aborted.\n"); exit(-1); } //get the input path std::string inputPath = argv[1]; inputPath += "/"; std::cout << "OCLBinaryGenerator input path is " << inputPath < 4) OCL_flag = argv[4]; std::cout << "OCLBinaryGenerator compiler flag is " << OCL_flag << std::endl; //start writing file std::ofstream outputFile; outputFile.open((outputPath+outputFilename).c_str(), std::ios::out); const char *outputFileHeader = "/*****************************************************************************\n" " * kernel binary file\n" " ****************************************************************************/\n\n"; outputFile << outputFileHeader; // get AMD platform cl_platform_id platform; cl_int status = getAMDPlatform(&platform); CL_CHECK(status); cl_uint numDevices; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); CL_CHECK(status); // get all gpu devices cl_device_id* devices = new cl_device_id[numDevices]; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); CL_CHECK(status); // choose device 0 or we can choose a target device in the future cl_device_id device = devices[0]; // create context cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; cl_context context = clCreateContext( cps, 1, // device &device, NULL, NULL, &status); CL_CHECK(status); cl_uint numDevicesInContext; status = clGetContextInfo(context, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevicesInContext, NULL); CL_CHECK(status); char **kernelBinary = new char*[1]; size_t kernelBinarySize; std::ifstream inputfile((inputPath+inputFilename).c_str()); if (!inputfile.is_open()) { printf("Input file does not exist. OCLBinaryGenerator aborted.\n"); exit(-1); } std::string str((std::istreambuf_iterator(inputfile)), std::istreambuf_iterator()); inputfile.close(); //std::cout<(" << outputKernelName << "_binArray);" << std::endl; outputFile << "size_t " << outputKernelName << "_binSize = " << kernelBinarySize << ";" << std::endl; outputFile << "const char * const " << outputKernelName << "_src = NULL;" << std::endl; //end writing file outputFile.close(); printf("OCLBinaryGenerator finished.\n"); } clblas-2.10/src/library/tools/bingen/000077500000000000000000000000001264277366700175555ustar00rootroot00000000000000clblas-2.10/src/library/tools/bingen/CMakeLists.txt000066400000000000000000000024701264277366700223200ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## cmake_minimum_required(VERSION 2.6) project(bingen C CXX) ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS) ADD_EXECUTABLE(bingen bingen.cpp) target_link_libraries(bingen ${OPENCL_LIBRARIES}) include_directories(${OPENCL_INCLUDE_DIRS}) set_target_properties( bingen PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/staging" ) if ( MSVC ) set_target_properties( bingen PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_CURRENT_BINARY_DIR}/staging" ) set_target_properties( bingen PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_CURRENT_BINARY_DIR}/staging" ) endif( ) clblas-2.10/src/library/tools/bingen/bingen.cpp000066400000000000000000000312471264277366700215320ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #ifdef __GNUC__ // Linux #include #include #include #else // Windows #include #include #include #define stat _stat #endif #include "CL/opencl.h" using namespace std; //const char* NameDevicesToSupport [] = {"Tahiti", "Hawaii"}; char* NameDevicesToSupport = NULL; int writeBinaryToFile(const char* fileName, const char* binary, size_t numBytes) { ofstream output; output.open(fileName, ios::binary | ios::trunc); if (output.is_open()) { output.write(binary, numBytes); output.close(); return 0; } else { return 1; } } cl_int GenBinary(cl_context context, const char * source, const char * outFile) { cl_int status = CL_SUCCESS; size_t sourceSize[] = {strlen(source)}; cl_program program = clCreateProgramWithSource(context,1, &source,sourceSize,&status); if (status!=CL_SUCCESS) { cout<<" error clCreateProgramWithSource, can't generate binaries"< kernelnames; while( inFile.good() ) { getline( inFile, str ); // Replace all tabs with spaces found = str.find( '\t' ); while (found != string::npos) { str.replace( found, 1, " " ); found = str.find( '\t' ); } // Find for beginning of the kernel, which will give the kernelName if ( !validKernel && (str.find( "char" ) != string::npos) && (str.find( '*' ) != string::npos) && (str.find( '"' ) != string::npos)) // Beginning of the kernel { kernelName = FindKernelNameFromString(str); if(_64BitsUse) kernelName+="_64"; else kernelName+="_32"; kernelName+="_bin"; kernelnames.push_back(kernelName); validKernel = true; // ss << str << "\\\n"; lineCount = 1; } // Deals with the case of a binary // Find for end of kernel else if( (str.find( "\";" ) != string::npos) && validKernel ) { //ss << str << "\n\n\n"; cout<<"string kernel name: "< #include #include #include #include #include "config.h" using namespace clMath; namespace po = boost::program_options; bool Config::isSane() { if (!hasFuncID_) { std::cerr << "Missing required options 'function'" << std::endl; return false; } return true; } void Config::setOptDesc( 
po::options_description& opts, bool useDefaults) { po::options_description genOpts("Generator Arguments"); genOpts.add_options() ("cpp", (useDefaults ? po::value()->default_value(cpp()) : po::value()), "Output file name for C++ generated source") ("cl", po::value(), "Output file name for OpenCL generated source") ("data", (useDefaults ? po::value()->default_value("random") : po::value()), "Data generation pattern\n" "Format: {random | unit | sawtooth}") ( "skip-accuracy", "Don't generate code for accuracy check. Applicable if the program " "is needed only for performance measurement") ; po::options_description openclOpts("OpenCL Arguments"); openclOpts.add_options() ("platform", (useDefaults ? po::value()->default_value(platform()) : po::value()), "Platform name") ("device", (useDefaults ? po::value()->default_value(device()) : po::value()), "Device name") ("build-options", po::value(), "Build options") ; po::options_description kargsOpts("BLAS Arguments"); kargsOpts.add_options() ("function,f", po::value(), "Function name, mandatory\n" "Format: {s | d | c | z}{BLAS function}") ("order", (useDefaults ? po::value()->default_value(clblasRowMajor) : po::value()), "Data ordering\n" "Format: {column | row}") ("side", (useDefaults ? po::value()->default_value(clblasLeft) : po::value()), "The side matrix A is located relative to matrix B\n" "Format: {left | right}") ("uplo", (useDefaults ? po::value()->default_value(clblasUpper) : po::value()), "Upper or lower triangle of matrix is being referenced\n" "Format: {upper | lower}") ("transA", (useDefaults ? po::value()->default_value(clblasNoTrans) : po::value()), "Matrix A transposition operation\n" "Format: {n | t | c}") ("transB", (useDefaults ? po::value()->default_value(clblasNoTrans) : po::value()), "Matrix B transposition operation\n" "Format: {n | t | c}") ("diag", (useDefaults ? po::value()->default_value(clblasNonUnit) : po::value()), "Whether the matrix is unit triangular\n" "Format: {unit | nonunit}") ("M,M", (useDefaults ? po::value()->default_value(256) : po::value()->default_value(256)) ) ("N,N", (useDefaults ? po::value()->default_value(256) : po::value()) ) ("K,K", (useDefaults ? po::value()->default_value(256) : po::value()) ) ("alpha", (useDefaults ? po::value()->default_value("1") : po::value()), "Alpha multiplier\n" "Format: real[,imag]") ("beta", (useDefaults ? po::value()->default_value("1") : po::value()), "Beta multiplier\n" "Format: real[,imag]") ("lda", po::value(), "Leading dimension of the matrix A") ("ldb", po::value(), "Leading dimension of the matrix B") ("ldc", po::value(), "Leading dimension of the matrix C") ("offA", (useDefaults ? po::value()->default_value(0) : po::value()), "Start offset in buffer of matrix A") ("offBX", (useDefaults ? po::value()->default_value(0) : po::value()), "Start offset in buffer of matrix B or vector X") ("offCY", (useDefaults ? po::value()->default_value(0) : po::value()), "Start offset in buffer of matrix C or vector Y") ("incx", (useDefaults ? po::value()->default_value(1) : po::value()), "Increment in the array X") ("incy", (useDefaults ? po::value()->default_value(1) : po::value()), "Increment in the array Y") ; po::options_description decompositionOpts("Decomposition Options"); decompositionOpts.add_options() ("decomposition,d", po::value(), "SubproblemDim\n" "Format: {subdims[0].x},{subdims[0].y},\n" " {subdims[0].bwidth},\n" " {subdims[1].x},{subdims[1].y},\n" " {subdims[1].bwidth}") ("multikernel", useDefaults ? 
po::value()->default_value(false) : po::value(), "Allow division of one BLAS function between several kernels") ; opts.add(genOpts).add(openclOpts).add(kargsOpts).add(decompositionOpts); } bool Config::loadConfig(const char* filename) { po::options_description cfgOpts; setOptDesc(cfgOpts, false); if ((filename == NULL) || (*filename == '\0')) { return false; } try { std::ifstream in(filename); po::store(po::parse_config_file(in, cfgOpts), vm); po::notify(vm); } catch (const po::invalid_command_line_syntax &err) { #if BOOST_VERSION >= 104200 switch (err.kind()) { case po::invalid_syntax::missing_parameter: std::cerr << "Missing argument for option `" << err.tokens() << "'" << std::endl; break; default: std::cerr << "Syntax error, kind " << int(err.kind()) << std::endl; break; } #else std::cerr << err.msg; #endif return false; } catch (const po::validation_error &err) { std::cerr << err.what() << std::endl; return false; } #if BOOST_VERSION >= 104200 catch (const po::reading_file &err) { std::cerr << err.what() << std::endl; return false; } #endif catch (const po::unknown_option &err) { std::cerr << err.what() << std::endl; } return applyOptions(vm, false); } bool Config::parseCommandLine(int argc, char *argv[]) { po::options_description helpOpts("Application Arguments"); helpOpts.add_options() ("config", po::value()->default_value(defaultConfig_), "Configuration file") ("help,h", "Show this help message"); po::options_description visibleOpts; visibleOpts.add(helpOpts); setOptDesc(visibleOpts, true); try { po::store(po::parse_command_line(argc, argv, visibleOpts), vm); po::notify(vm); } catch (const po::invalid_command_line_syntax &err) { #if BOOST_VERSION >= 104200 switch (err.kind()) { case po::invalid_syntax::missing_parameter: std::cerr << "Missing argument for option `" << err.tokens() << "'" << std::endl; break; default: std::cerr << "Syntax error, kind " << int(err.kind()) << std::endl; break; }; #else std::cerr << err.msg; #endif return false; } catch (const po::validation_error &err) { std::cerr << err.what() << std::endl; return false; } catch (const po::unknown_option &err) { std::cerr << err.what() << std::endl; } if (vm.count("help")) { std::cout << visibleOpts << std::endl; return false; } if (vm.count("config")) { loadConfig(vm["config"].as().c_str()); } return applyOptions(vm); } bool Config::applyOptions( const po::variables_map& vm, bool stopOnError) { bool rc; ArgMultiplier v; rc = true; if (vm.count("function")) { if (!setFunction(vm["function"].as())) { std::cerr << "Invalid function name: " << vm["function"].as() << std::endl; return false; } } if (vm.count("cpp")) { setCpp(vm["cpp"].as()); } if (vm.count("cl")) { setCl(vm["cl"].as()); } if (vm.count("data")) { if (!setDataPattern(vm["data"].as())) { std::cerr << "Invalid data pattern name" << std::endl; rc = false; if (stopOnError) { return false; } } } if (vm.count("skip-accuracy")) { setSkipAccuracy(); } if (vm.count("platform")) { if (!setPlatform(vm["platform"].as())) { std::cerr << "Invalid platform name" << std::endl; rc = false; if (stopOnError) { return false; } } } if (vm.count("device")) { if (!setDevice(vm["device"].as())) { std::cerr << "Invalid device name" << std::endl; rc = false; if (stopOnError) { return false; } } } if (vm.count("build-options")) { setBuildOptions(vm["build-options"].as()); } if (vm.count("order")) { setOrder(vm["order"].as()); } if (vm.count("side")) { setSide(vm["side"].as()); } if (vm.count("uplo")) { setUplo(vm["uplo"].as()); } if (vm.count("transA")) { 
setTransA(vm["transA"].as()); } if (vm.count("transB")) { setTransB(vm["transB"].as()); } if (vm.count("diag")) { setDiag(vm["diag"].as()); } if (vm.count("M")) { setM(vm["M"].as()); } if (vm.count("N")) { setN(vm["N"].as()); } if (vm.count("K")) { setK(vm["K"].as()); } if (vm.count("alpha")) { if (!parseArgMultiplier(vm["alpha"].as(), v)) { std::cerr << "in option 'alpha': invalid option value" << std::endl; rc = false; if (stopOnError) { return false; } } setAlpha(v); } if (vm.count("beta")) { if (!parseArgMultiplier(vm["beta"].as(), v)) { std::cerr << "in option 'beta': invalid option value" << std::endl; rc = false; if (stopOnError) { return false; } } setBeta(v); } if (vm.count("lda")) { setLDA(vm["lda"].as()); } if (vm.count("ldb")) { setLDB(vm["ldb"].as()); } if (vm.count("ldc")) { setLDC(vm["ldc"].as()); } if (vm.count("offA")) { setOffA(vm["offA"].as()); } if (vm.count("offBX")) { setOffBX(vm["offBX"].as()); } if (vm.count("offCY")) { setOffCY(vm["offCY"].as()); } if (vm.count("incx")) { setIncX(vm["incx"].as()); } if (vm.count("incy")) { setIncY(vm["incy"].as()); } if (vm.count("decomposition")) { if (!parseDecompositionOpt(vm["decomposition"].as())) { std::cerr << "in option 'decomposition': invalid option value" << std::endl; rc = false; if (stopOnError) { return false; } } } if (vm.count("multikernel")) { setMultiKernel(vm["multikernel"].as()); } return rc; } std::istream& operator>>(std::istream& in, clblasOrder& order) { std::string token; in >> token; if (token == "row") { order = clblasRowMajor; } else if (token == "column") { order = clblasColumnMajor; } else { #if BOOST_VERSION >= 104200 throw po::validation_error(po::validation_error::invalid_option_value); #else throw po::validation_error("invalid option value"); #endif } return in; } std::ostream& operator<<(std::ostream& out, const clblasOrder& order) { switch (order) { case clblasRowMajor: out << "row"; break; case clblasColumnMajor: out << "column"; break; } return out; } std::istream& operator>>(std::istream& in, clblasSide& side) { std::string token; in >> token; if (token == "left") { side = clblasLeft; } else if (token == "right") { side = clblasRight; } else { #if BOOST_VERSION >= 104200 throw po::validation_error(po::validation_error::invalid_option_value); #else throw po::validation_error("invalid option value"); #endif } return in; } std::ostream& operator<<(std::ostream& out, const clblasSide& side) { switch (side) { case clblasLeft: out << "left"; break; case clblasRight: out << "right"; break; } return out; } std::istream& operator>>(std::istream& in, clblasUplo& uplo) { std::string token; in >> token; if (token == "upper") { uplo = clblasUpper; } else if (token == "lower") { uplo = clblasLower; } else { #if BOOST_VERSION >= 104200 throw po::validation_error(po::validation_error::invalid_option_value); #else throw po::validation_error("invalid option value"); #endif } return in; } std::ostream& operator<<(std::ostream& out, const clblasUplo& uplo) { switch (uplo) { case clblasUpper: out << "upper"; break; case clblasLower: out << "lower"; break; } return out; } std::istream& operator>>(std::istream& in, clblasTranspose& trans) { std::string token; in >> token; if (token == "n") { trans = clblasNoTrans; } else if (token == "t") { trans = clblasTrans; } else if (token == "c") { trans = clblasConjTrans; } else { #if BOOST_VERSION >= 104200 throw po::validation_error(po::validation_error::invalid_option_value); #else throw po::validation_error("invalid option value"); #endif } return in; } std::ostream& 
operator<<(std::ostream& out, const clblasTranspose& trans) { switch (trans) { case clblasNoTrans: out << "n"; break; case clblasTrans: out << "t"; break; case clblasConjTrans: out << "c"; break; } return out; } std::istream& operator>>(std::istream& in, clblasDiag& diag) { std::string token; in >> token; if (token == "unit") { diag = clblasUnit; } else if (token == "nonunit") { diag = clblasNonUnit; } else { #if BOOST_VERSION >= 104200 throw po::validation_error(po::validation_error::invalid_option_value); #else throw po::validation_error("invalid option value"); #endif } return in; } std::ostream& operator<<(std::ostream& out, const clblasDiag& diag) { switch (diag) { case clblasUnit: out << "unit"; break; case clblasNonUnit: out << "nonunit"; break; } return out; } bool Config::parseDecompositionOpt(const std::string& opt) { size_t v[6]; // x0, y0, bwidth0, x1, y1, bwidth1 boost::tokenizer<> tok(opt); boost::tokenizer<>::iterator it = tok.begin(); for (int i = 0; i < 6; i++) { if (it == tok.end()) { return false; } try { v[i] = boost::lexical_cast(*it); } catch (boost::bad_lexical_cast&) { return false; } ++it; } if (it != tok.end()) { return false; } setDecomposition(v[0], v[1], v[2], v[3], v[4], v[5]); return true; } bool Config::parseArgMultiplier( const std::string& opt, ArgMultiplier& v) { boost::char_separator sep(","); boost::tokenizer< boost::char_separator > tok(opt, sep); boost::tokenizer< boost::char_separator >::iterator it = tok.begin(); try { switch (kargs_.dtype) { case TYPE_FLOAT: v.argFloat = boost::lexical_cast(*it); ++it; break; case TYPE_DOUBLE: v.argDouble = boost::lexical_cast(*it); ++it; break; case TYPE_COMPLEX_FLOAT: v.argFloatComplex.s[0] = boost::lexical_cast(*it); ++it; if (it == tok.end()) { v.argFloatComplex.s[1] = 0; } else { v.argFloatComplex.s[1] = boost::lexical_cast(*it); ++it; } break; case TYPE_COMPLEX_DOUBLE: v.argDoubleComplex.s[0] = boost::lexical_cast(*it); ++it; if (it == tok.end()) { v.argDoubleComplex.s[1] = 0; } else { v.argDoubleComplex.s[1] = boost::lexical_cast(*it); ++it; } break; } } catch (boost::bad_lexical_cast&) { return false; } return (it == tok.end()); } clblas-2.10/src/library/tools/ktest/config.cpp000066400000000000000000000253371264277366700214300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include "config.h" using namespace clMath; static const char DEFAULT_PLATFORM_NAME[] = "AMD Accelerated Parallel Processing"; Config::Config() : defaultConfig_(""), cpp_("ktest.cpp"), dataPattern_(RANDOM_MATRIX), buildOptions_(""), funcID_(CLBLAS_GEMM), hasFuncID_(false), hasSubdims_(false), skipAccuracy_(false) { setPlatform(DEFAULT_PLATFORM_NAME); setDevice(""); memset(&kargs_, 0, sizeof(kargs_)); kargs_.kernType = CLBLAS_COMPUTING_KERNEL; kargs_.A = kargs_.B = kargs_.C = NULL; kargs_.offsetM = kargs_.offsetN = 0; kargs_.scimage[0] = kargs_.scimage[1] = NULL; kargs_.addrBits = 0; kargs_.dtype = TYPE_FLOAT; kargs_.order = clblasRowMajor; kargs_.side = clblasLeft; kargs_.uplo = clblasUpper; kargs_.transA = clblasNoTrans; kargs_.transB = clblasNoTrans; kargs_.diag = clblasNonUnit; kargs_.M = kargs_.N = kargs_.K = 0; kargs_.lda.matrix = kargs_.ldb.matrix = kargs_.ldc.matrix = 0; kargs_.offA = kargs_.offBX = kargs_.offCY = 0; memset(&kargs_.alpha, 0, sizeof(kargs_.alpha)); memset(&kargs_.beta, 0, sizeof(kargs_.beta)); memset(subdims_, 0, sizeof(subdims_)); names_[CLBLAS_GEMV] = "gemv"; names_[CLBLAS_SYMV] = "symv"; names_[CLBLAS_GEMM] = "gemm"; names_[CLBLAS_TRMM] = "trmm"; names_[CLBLAS_TRSM] = "trsm"; names_[CLBLAS_SYRK] = "syrk"; names_[CLBLAS_SYR2K] = "syr2k"; cl_ = names_[funcID_] + ".cl"; } Config::~Config() { names_.clear(); } const std::string& Config::cpp() const { return cpp_; } const std::string& Config::cl() const { return cl_; } clMath::KTestMatrixGenerator Config::dataPattern() const { return dataPattern_; } std::string Config::platform() const { std::string name; cl_int err; size_t sz; char *pname; err = clGetPlatformInfo(platform_, CL_PLATFORM_NAME, 0, NULL, &sz); if (err != CL_SUCCESS) { return ""; } pname = new char[sz + 1]; err = clGetPlatformInfo(platform_, CL_PLATFORM_NAME, sz, pname, NULL); if (err != CL_SUCCESS) { delete[] pname; return ""; } name = pname; delete[] pname; return name; } std::string Config::device() const { std::string name; cl_int err; size_t sz; char *dname; err = clGetDeviceInfo(device_, CL_DEVICE_NAME, 0, NULL, &sz); if (err != CL_SUCCESS) { return ""; } dname = new char[sz + 1]; err = clGetDeviceInfo(device_, CL_DEVICE_NAME, sz, dname, NULL); if (err != CL_SUCCESS) { delete[] dname; return ""; } name = dname; delete[] dname; return name; } const std::string& Config::buildOptions() const { return buildOptions_; } void Config::kargs(CLBlasKargs *kargs) const { cl_int err; *kargs = kargs_; kargs->addrBits = deviceAddressBits(device_, &err); } bool Config::permitMultiKernels() const { return multiKernel_; } bool Config::withAccuracy() const { return !skipAccuracy_; } bool Config::decomposition(SubproblemDim subdims[MAX_SUBDIMS]) const { if (!hasSubdims_) { return false; } for (int i = 0; i < MAX_SUBDIMS; i++) { subdims[i] = subdims_[i]; } subdims[0].itemX = subdims[0].x; subdims[0].itemY = subdims[0].y; subdims[1].itemX = subdims[1].x; subdims[1].itemY = subdims[1].y; return true; } BlasFunctionID Config::blasFunctionID() const { return funcID_; } void Config::setDefaultConfig(const std::string& filename) { defaultConfig_ = filename; } void Config::setCpp(const std::string& name) { cpp_ = name; } void Config::setCl(const std::string& name) { cl_ = name; } bool Config::setDataPattern(const std::string& name) { if (strcmp(name.c_str(), "random") == 0) { dataPattern_ = clMath::RANDOM_MATRIX; return true; } if (strcmp(name.c_str(), "unit") == 0) { 
dataPattern_ = clMath::UNIT_MATRIX; return true; } if (strcmp(name.c_str(), "sawtooth") == 0) { dataPattern_ = clMath::SAWTOOTH_MATRIX; return true; } return false; } bool Config::setPlatform(const std::string& name) { cl_int err; cl_uint nrPlatforms; cl_platform_id *platforms; bool found; size_t sz; char *pname; err = clGetPlatformIDs(0, NULL, &nrPlatforms); if ((err != CL_SUCCESS) || (nrPlatforms == 0)) { return false; } platforms = new cl_platform_id[nrPlatforms]; err = clGetPlatformIDs(nrPlatforms, platforms, NULL); if (err != CL_SUCCESS) { delete[] platforms; return false; } found = false; for (cl_uint i = 0; i < nrPlatforms; i++) { err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL, &sz); if (err != CL_SUCCESS) { continue; } pname = new char[sz + 1]; err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sz, pname, NULL); if (err != CL_SUCCESS) { delete[] pname; continue; } if (name.empty()) { found = (strcmp(pname, DEFAULT_PLATFORM_NAME) == 0); } else { found = (strcmp(pname, name.c_str()) == 0); } delete[] pname; if (found) { platform_ = platforms[i]; break; } } delete[] platforms; return found; } bool Config::setDevice(const std::string& name) { cl_int err; cl_uint nrDevices; cl_device_id *devices; bool found; size_t sz; char *dname; if ((platform_ == NULL) && !setPlatform("")) { return false; } err = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices); if ((err != CL_SUCCESS) || (nrDevices == 0)) { return false; } devices = new cl_device_id[nrDevices]; err = clGetDeviceIDs(platform_, CL_DEVICE_TYPE_ALL, nrDevices, devices, NULL); if (err != CL_SUCCESS) { delete[] devices; return false; } found = false; for (cl_uint i = 0; i < nrDevices; i++) { err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 0, NULL, &sz); if (err != CL_SUCCESS) { continue; } dname = new char[sz + 1]; err = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sz, dname, NULL); if (err != CL_SUCCESS) { delete[] dname; continue; } if (name.empty()) { found = true; } else { found = (strcmp(dname, name.c_str()) == 0); } delete[] dname; if (found) { device_ = devices[i]; break; } } delete[] devices; return found; } void Config::setBuildOptions(const std::string& options) { buildOptions_ = options; } bool Config::setFunction(const std::string& name) { if (name.empty()) { return false; } switch (name.c_str()[0]) { case 's': case 'S': kargs_.dtype = TYPE_FLOAT; break; case 'd': case 'D': kargs_.dtype = TYPE_DOUBLE; break; case 'c': case 'C': kargs_.dtype = TYPE_COMPLEX_FLOAT; break; case 'z': case 'Z': kargs_.dtype = TYPE_COMPLEX_DOUBLE; break; default: return false; } for (NameMap::iterator it = names_.begin(); it != names_.end(); ++it) { if (strcmp(name.substr(1).c_str(), (*it).second.c_str()) == 0) { funcID_ = (*it).first; setCl((*it).second + ".cl"); hasFuncID_ = true; return true; } } return false; } void Config::setOrder(clblasOrder order) { kargs_.order = order; } void Config::setSide(clblasSide side) { kargs_.side = side; } void Config::setUplo(clblasUplo uplo) { kargs_.uplo = uplo; } void Config::setTransA(clblasTranspose transA) { kargs_.transA = transA; } void Config::setTransB(clblasTranspose transB) { kargs_.transB = transB; } void Config::setDiag(clblasDiag diag) { kargs_.diag = diag; } void Config::setM(size_t M) { kargs_.M = M; } void Config::setN(size_t N) { kargs_.N = N; } void Config::setK(size_t K) { kargs_.K = K; } void Config::setAlpha(ArgMultiplier alpha) { switch (kargs_.dtype) { case TYPE_FLOAT: kargs_.alpha.argFloat = alpha.argFloat; break; case TYPE_DOUBLE: 
kargs_.alpha.argDouble = alpha.argDouble; break; case TYPE_COMPLEX_FLOAT: kargs_.alpha.argFloatComplex = alpha.argFloatComplex; break; case TYPE_COMPLEX_DOUBLE: kargs_.alpha.argDoubleComplex = alpha.argDoubleComplex; break; } } void Config::setBeta(ArgMultiplier beta) { switch (kargs_.dtype) { case TYPE_FLOAT: kargs_.beta.argFloat = beta.argFloat; break; case TYPE_DOUBLE: kargs_.beta.argDouble = beta.argDouble; break; case TYPE_COMPLEX_FLOAT: kargs_.beta.argFloatComplex = beta.argFloatComplex; break; case TYPE_COMPLEX_DOUBLE: kargs_.beta.argDoubleComplex = beta.argDoubleComplex; break; } } void Config::setLDA(size_t lda) { kargs_.lda.matrix = lda; } void Config::setLDB(size_t ldb) { kargs_.ldb.matrix = ldb; } void Config::setLDC(size_t ldc) { kargs_.ldc.matrix = ldc; } void Config::setIncX(int incx) { kargs_.ldb.vector = incx; } void Config::setIncY(int incy) { kargs_.ldc.vector = incy; } void Config::setOffA(size_t offA) { kargs_.offA = offA; } void Config::setOffBX(size_t offBX) { kargs_.offBX = offBX; } void Config::setOffCY(size_t offCY) { kargs_.offCY = offCY; } void Config::setMultiKernel(bool multiKernel) { multiKernel_ = multiKernel; } void Config::setSkipAccuracy(void) { skipAccuracy_ = true; } void Config::setDecomposition( size_t x0, size_t y0, size_t bwidth0, size_t x1, size_t y1, size_t bwidth1) { subdims_[0].x = x0; subdims_[0].y = y0; subdims_[0].bwidth = bwidth0; subdims_[1].x = x1; subdims_[1].y = y1; subdims_[1].bwidth = bwidth1; hasSubdims_ = true; } clblas-2.10/src/library/tools/ktest/config.h000066400000000000000000000072061264277366700210700ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef KTEST_CONFIG_H__ #define KTEST_CONFIG_H__ #include #include #include #include #include #include #include "ktest-common.h" namespace po = boost::program_options; namespace clMath { typedef std::map NameMap; class Config { private: std::string defaultConfig_; std::string cpp_; std::string cl_; clMath::KTestMatrixGenerator dataPattern_; cl_platform_id platform_; cl_device_id device_; std::string buildOptions_; BlasFunctionID funcID_; CLBlasKargs kargs_; SubproblemDim subdims_[MAX_SUBDIMS]; bool hasFuncID_; bool hasSubdims_; bool multiKernel_; bool skipAccuracy_; po::variables_map vm; NameMap names_; void setOptDesc(po::options_description& opts, bool useDefaults); bool applyOptions(const po::variables_map& vm, bool stopOnError = true); bool parseGroupSizeOpt(const std::string& opt); bool parseDecompositionOpt(const std::string& opt); bool parseArgMultiplier(const std::string& opt, ArgMultiplier& v); public: Config(); ~Config(); const std::string& cpp() const; const std::string& cl() const; clMath::KTestMatrixGenerator dataPattern() const; std::string platform() const; std::string device() const; const std::string& buildOptions() const; void kargs(CLBlasKargs *kargs) const; bool permitMultiKernels() const; bool withAccuracy() const; bool decomposition(SubproblemDim subdims[MAX_SUBDIMS]) const; BlasFunctionID blasFunctionID() const; void setDefaultConfig(const std::string& filename); void setCpp(const std::string& name); void setCl(const std::string& name); bool setDataPattern(const std::string& name); bool setPlatform(const std::string& name); bool setDevice(const std::string& name); void setBuildOptions(const std::string& options); bool setFunction(const std::string& name); void setOrder(clblasOrder order); void setSide(clblasSide side); void setUplo(clblasUplo uplo); void setTransA(clblasTranspose transA); void setTransB(clblasTranspose transB); void setDiag(clblasDiag diag); void setM(size_t M); void setN(size_t N); void setK(size_t K); void setAlpha(ArgMultiplier alpha); void setBeta(ArgMultiplier beta); void setLDA(size_t lda); void setLDB(size_t ldb); void setLDC(size_t ldc); void setOffA(size_t offA); void setOffBX(size_t offBX); void setOffCY(size_t offCY); void setIncX(int incx); void setIncY(int incy); void setMultiKernel(bool multiKernel); void setSkipAccuracy(); void setDecomposition(size_t x0, size_t y0, size_t bwidth0, size_t x1, size_t y1, size_t bwidth1); bool parseCommandLine(int argc, char *argv[]); bool loadConfig(const char* filename); bool isSane(); }; } // namespace clMath #endif // KTEST_CONFIG_H__ clblas-2.10/src/library/tools/ktest/ktest-common.h000066400000000000000000000017471264277366700222470ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef KTEST_COMMON_H_ #define KTEST_COMMON_H_ namespace clMath { typedef enum KTestMatrixGenerator { RANDOM_MATRIX, UNIT_MATRIX, SAWTOOTH_MATRIX, N_MATRIX_GENERATORS } KTestMatrixGenerator; } #endif /* KTEST_COMMON_H_ */ clblas-2.10/src/library/tools/ktest/ktest-patterns.h000066400000000000000000000256711264277366700226210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_PATTERNS_H_ #define KTEST_PATTERNS_H_ static std::string loadFileCode = "char*\n" "loadFile(const char* path)\n" "{\n" " FILE *f;\n" " long size;\n" " char *text;\n" "\n" " f = fopen(path, \"r\");\n" " if (f == NULL) {\n" " return NULL;\n" " }\n" "\n" " if (fseek(f, 0, SEEK_END) != 0) {\n" " fclose(f);\n" " return NULL;\n" " }\n" " size = ftell(f);\n" " if (size == -1) {\n" " fclose(f);\n" " return NULL;\n" " }\n" " if (fseek(f, 0, SEEK_SET) != 0) {\n" " fclose(f);\n" " return NULL;\n" " }\n" "\n" " text = (char*)calloc(size + 1, 1);\n" " if (text == NULL) {\n" " fclose(f);\n" " return NULL;\n" " }\n" "\n" " if (fread(text, 1, size, f) == 0) {\n" " free(text);\n" " fclose(f);\n" " return NULL;\n" " }\n" " fclose(f);\n" " return text;\n" "}\n"; static std::string randomVectorCode = "template\n" "void\n" "randomVector(\n" " size_t N,\n" " T *X,\n" " int incx)\n" "{\n" " size_t n;\n" " VectorAccessor x(X, N, incx);\n" "\n" " for (n = 0; n < N; n++) {\n" " x[n] = random();\n" " }\n" "}\n"; static std::string unitVectorCode = "template\n" "void\n" "unitVector(\n" " size_t N,\n" " T *X,\n" " int incx)\n" "{\n" " size_t n;\n" " VectorAccessor x(X, N, incx);\n" "\n" " for (n = 0; n < N; n++) {\n" " x[n] = ONE();\n" " }\n" "}\n"; static std::string sawtoothVectorCode = "template\n" "void\n" "sawtoothVector(\n" " size_t N,\n" " T *X,\n" " int incx)\n" "{\n" " T v;\n" " size_t n;\n" " VectorAccessor x(X, N, incx);\n" "\n" " v = ONE();\n" " for (n = 0; n < N; n++) {\n" " x[n] = v;\n" " v = v + ONE();\n" " }\n" "}\n"; static std::string compareVectorsCode = "template\n" "bool\n" "compareVectors(\n" " size_t N,\n" " T *blasVector,\n" " T *naiveVector,\n" " int incx)\n" "{\n" " size_t n;\n" " VectorAccessor blas(blasVector, N, incx);\n" " VectorAccessor naive(naiveVector, N, incx);\n" " T blasVal, naiveVal;\n" "\n" " for (n = 0; n < N; n++) {\n" " blasVal = blas[n];\n" " naiveVal = naive[n];\n" " if (isNAN(blasVal) && isNAN(naiveVal)) {\n" " continue;\n" " }\n" " if (blasVal != naiveVal) {\n" " return false;\n" " }\n" " }\n" " return true;\n" "}\n"; static std::string compareMatricesCode = "template\n" "bool\n" "compareMatrices(\n" " clblasOrder order,\n" " size_t rows,\n" " size_t columns,\n" " T *blasMatrix,\n" " T *naiveMatrix,\n" " size_t ld)\n" "{\n" " size_t r, c;\n" " MatrixAccessor blas(blasMatrix, order, clblasNoTrans, rows, columns, 
ld);\n" " MatrixAccessor naive(naiveMatrix, order, clblasNoTrans, rows, columns, ld);\n" " T blasVal, naiveVal;\n" "\n" " for (r = 0; r < rows; r++) {\n" " for (c = 0; c < columns; c++) {\n" " blasVal = blas[r][c];\n" " naiveVal = naive[r][c];\n" " if (isNAN(blasVal) && isNAN(naiveVal)) {\n" " continue;\n" " }\n" " if (blasVal != naiveVal) {\n" " return false;\n" " }\n" " }\n" " }\n" " return true;\n" "}\n"; static std::string randomMatrixCode = "\n" "template\n" "void\n" "randomMatrix(\n" " clblasOrder order,\n" " size_t rows,\n" " size_t columns,\n" " T *A,\n" " size_t lda)\n" "{\n" " size_t r, c;\n" " MatrixAccessor a(A, order, clblasNoTrans, rows, columns, lda);\n" "\n" " for (r = 0; r < rows; r++) {\n" " for (c = 0; c < columns; c++) {\n" " a[r][c] = random();\n" " }\n" " }\n" "}\n"; static std::string unitMatrixCode = "\n" "template\n" "void\n" "unitMatrix(\n" " clblasOrder order,\n" " size_t rows,\n" " size_t columns,\n" " T *A,\n" " size_t lda)\n" "{\n" " size_t r, c;\n" " MatrixAccessor a(A, order, clblasNoTrans, rows, columns, lda);\n" "\n" " for (r = 0; r < rows; r++) {\n" " for (c = 0; c < columns; c++) {\n" " a[r][c] = ONE();\n" " }\n" " }\n" "}\n"; static std::string sawtoothMatrixCode = "\n" "template\n" "void\n" "sawtoothMatrix(\n" " clblasOrder order,\n" " size_t rows,\n" " size_t columns,\n" " T *A,\n" " size_t lda)\n" "{\n" " size_t step;\n" " T v;\n" " size_t r, c;\n" " MatrixAccessor a(A, order, clblasNoTrans, rows, columns, lda);\n" "\n" " step = sqrt(rows);\n" " v = ONE();\n" "\n" " for (r = 0; r < rows; r++) {\n" " if ((r != 0) && (r % step == 0)) {\n" " v = v + ONE();\n" " }\n" " for (c = 0; c < columns; c++) {\n" " a[r][c] = v;\n" " }\n" " }\n" "}\n"; static std::string setUpTRSMDiagonalCode = "template\n" "void\n" "setUpTRSMDiagonal(\n" " clblasOrder order,\n" " clblasSide side,\n" " clblasUplo uplo,\n" " clblasTranspose transA,\n" " clblasDiag diag,\n" " size_t M,\n" " size_t N,\n" " T alpha,\n" " T *A,\n" " size_t lda,\n" " T *B,\n" " size_t ldb)\n" "{\n" " size_t sizeA = (side == clblasRight) ? N : M;\n" "\n" " if (diag == clblasNonUnit) {\n" " size_t k = side == clblasLeft ? 
M : N;\n" " MatrixAccessor a(A, order, clblasNoTrans, k, k, lda);\n" " for (cl_uint i = 0; i < sizeA; i++) {\n" " a[i][i] = ONE();\n" " }\n" " double ub = UPPER_BOUND();\n" " while (ub >= 1) {\n" " size_t i = rand() % k;\n" " a[i][i] = a[i][i] * TWO();\n" " ub /= 2;\n" " }\n" " \n" " }\n" " NaiveBlas::trmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb);\n" "}\n" "\n"; static std::string forwardDeclarationsCode = "cl_platform_id getPlatform(const char *name);\n" "cl_device_id getDevice(cl_platform_id platform, const char *name);\n" "cl_kernel createKernel(const char *source, cl_context context,\n" " const char* options, cl_int *error);\n" "void printExecTime(cl_ulong ns);\n"; static std::string getPlatformCode = "cl_platform_id\n" "getPlatform(const char *name)\n" "{\n" " cl_int err;\n" " cl_uint nrPlatforms, i;\n" " cl_platform_id *list, platform;\n" " char platformName[64];\n" "\n" " err = clGetPlatformIDs(0, NULL, &nrPlatforms);\n" " if (err != CL_SUCCESS) {\n" " return NULL;\n" " }\n" "\n" " list = (cl_platform_id*)calloc(nrPlatforms, sizeof(*list));\n" " if (list == NULL) {\n" " return NULL;\n" " }\n" "\n" " err = clGetPlatformIDs(nrPlatforms, list, NULL);\n" " if (err != CL_SUCCESS) {\n" " free(list);\n" " return NULL;\n" " }\n" "\n" " platform = NULL;\n" " for (i = 0; i < nrPlatforms; i++) {\n" " err = clGetPlatformInfo(list[i], CL_PLATFORM_NAME,\n" " sizeof(platformName), platformName, NULL);\n" " if ((err == CL_SUCCESS) && (strcmp(platformName, name) == 0)) {\n" " platform = list[i];\n" " break;\n" " }\n" " }\n" "\n" " free(list);\n" " return platform;\n" "}\n"; static std::string getDeviceCode = "cl_device_id\n" "getDevice(\n" " cl_platform_id platform,\n" " const char *name)\n" "{\n" "\n" " cl_int err;\n" " cl_uint nrDevices, i;\n" " cl_device_id *list, device;\n" " char deviceName[64];\n" "\n" " err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices);\n" " if (err != CL_SUCCESS) {\n" " return NULL;\n" " }\n" " list = (cl_device_id*)calloc(nrDevices, sizeof(*list));\n" " if (list == NULL) {\n" " return NULL;\n" " }\n" "\n" " err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, nrDevices, list, NULL);\n" " if (err != CL_SUCCESS) {\n" " free(list);\n" " return NULL;\n" " }\n" "\n" " device = NULL;\n" " for (i = 0; i < nrDevices; i++) {\n" " err = clGetDeviceInfo(list[i], CL_DEVICE_NAME,\n" " sizeof(deviceName), deviceName, NULL);\n" " if ((err == CL_SUCCESS) && (strcmp(deviceName, name) == 0)) {\n" " device = list[i];\n" " break;\n" " }\n" " }\n" "\n" " free(list);\n" " return device;\n" "}\n"; static std::string createKernelCode = "cl_kernel\n" "createKernel(\n" " const char* source,\n" " cl_context context,\n" " const char* options,\n" " cl_int* error)\n" "{\n" "\n" " cl_int err;\n" " cl_device_id device;\n" " cl_program program;\n" " cl_kernel kernel;\n" " size_t logSize;\n" " char *log;\n" "\n" " err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL);\n" " if (err != CL_SUCCESS) {\n" " if (error != NULL) {\n" " *error = err;\n" " }\n" " return NULL;\n" " }\n" "\n" " program = clCreateProgramWithSource(context, 1, &source, NULL, error);\n" " if (program == NULL) {\n" " return NULL;\n" " }\n" "\n" " err = clBuildProgram(program, 1, &device, options, NULL, NULL);\n" " if (err != CL_SUCCESS) {\n" " logSize = 0;\n" " clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);\n" " log = (char*)calloc(1, logSize + 1);\n" " clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);\n" 
" printf(\"=== Build log ===\\n%s\\n\", log);\n" " free(log);\n" " clReleaseProgram(program);\n" " if (error != NULL) {\n" " *error = err;\n" " }\n" " return NULL;\n" " }\n" "\n" " kernel = NULL;\n" " err = clCreateKernelsInProgram(program, 1, &kernel, NULL);\n" " clReleaseProgram(program);\n" " if (error != NULL) {\n" " *error = err;\n" " }\n" " return kernel;\n" "}\n"; static std::string printTimeCode = "void\n" "printExecTime(cl_ulong ns)\n" "{\n" " if (ns > 10000000) {\n" " printf(\"Kernel execution time: %lu milliseconds\\n\", ns / 1000000);\n" " }\n" " else if (ns > 10000) {\n" " printf(\"Kernel execution time: %lu microseconds\\n\", ns / 1000);\n" " }\n" " else {\n" " printf(\"Kernel execution time: %lu nanoseconds\\n\", ns);\n" " }\n" "}\n"; #endif /* KTEST_PATTERNS_H_ */ clblas-2.10/src/library/tools/ktest/ktest.cpp000066400000000000000000000515031264277366700213070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "var.h" #include "ktest.h" #include "ktest-patterns.h" using namespace clMath; KTest::KTest(Step *step, clMath::Config *cfg) : platform_(cfg->platform()), device_(cfg->device()), kernelSourceFile_(cfg->cl()), buildOptions_(cfg->buildOptions()), matrixGen_(cfg->dataPattern()), masterStep_(step), indent_(0), useSeveralKernels_(false) { } KTest::KTest(Step *masterStep, std::vector *steps, clMath::Config *cfg) : platform_(cfg->platform()), device_(cfg->device()), kernelSourceFile_(cfg->cl()), buildOptions_(cfg->buildOptions()), matrixGen_(cfg->dataPattern()), masterStep_(masterStep), steps_(steps), indent_(0), useSeveralKernels_(true) { } std::string KTest::indent() { std::string str = ""; for (size_t i = 0; i < indent_; i++) { str += " "; } return str; } const char* KTest::matrixGenName(KTestMatrixGenerator gen) { switch (gen) { case RANDOM_MATRIX: return "randomMatrix"; case UNIT_MATRIX: return "unitMatrix"; case SAWTOOTH_MATRIX: return "sawtoothMatrix"; default: return NULL; } } const char* KTest::vectorGenName(KTestMatrixGenerator gen) { switch (gen) { case RANDOM_MATRIX: return "randomVector"; case UNIT_MATRIX: return "unitVector"; case SAWTOOTH_MATRIX: return "sawtoothVector"; default: return NULL; } } std::string KTest::generate(bool withAccuracy) { std::stringstream ss; int level; ss << indent() << "#define _CRT_SECURE_NO_WARNINGS" << std::endl; ss << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl << indent() << "#include " << std::endl; if (masterStep_->blasFunctionID() == CLBLAS_TRSM) { ss << indent() << "#include " << std::endl << indent() << "#define NANF NAN" << std::endl; } includes(ss); ss << std::endl << indent() << "#include 
\"naive_blas.cpp\"" << std::endl << std::endl << indent() << "using namespace NaiveBlas;" << std::endl; if (masterStep_->blasFunctionID() == CLBLAS_TRSM) { setUpTRSMDiagonal(ss); } level = funcBlasLevel(masterStep_->blasFunctionID()); switch (matrixGen_) { case RANDOM_MATRIX: if (level == 2) { randomVector(ss); } randomMatrix(ss); break; case UNIT_MATRIX: if (level == 2) { unitVector(ss); } unitMatrix(ss); break; case SAWTOOTH_MATRIX: if (level == 2) { sawtoothVector(ss); } sawtoothMatrix(ss); break; default: break; } if (withAccuracy) { if (level == 2) { compareVectors(ss); } else { compareMatrices(ss); } } declareKTestOptions(ss); declareBlasOptions(ss, masterStep_); declarePatternVars(ss, masterStep_); ss << std::endl << indent() << "char* loadFile(const char* path);" << std::endl; forwardDeclarations(ss); generateMain(ss, withAccuracy); loadFile(ss); auxFunctions(ss); return ss.str(); } void KTest::declareKTestOptions(std::stringstream& ss) { ss << std::endl; ss << indent() << "const char PLATFORM_NAME[] = \"" << platform_ << "\";" << std::endl; ss << indent() << "const char DEVICE_NAME[] = \"" << device_ << "\";" << std::endl; ss << indent() << "const char BUILD_OPTIONS[] = \"" << buildOptions_ << "\";" << std::endl; ss << indent() << "const char KERNEL_SOURCE[] = \"" << kernelSourceFile_ << "\";" << std::endl; } void KTest::declareBlasOptions(std::stringstream& ss, Step *step) { ss << std::endl; ss << indent() << "const clblasOrder order = " << ((step->kargs().order == clblasColumnMajor) ? "clblasColumnMajor" : "clblasRowMajor") << ";" << std::endl; ss << indent() << "const clblasSide side = " << ((step->kargs().side == clblasRight) ? "clblasRight" : "clblasLeft") << ";" << std::endl; ss << indent() << "const clblasUplo uplo = " << ((step->kargs().uplo == clblasUpper) ? "clblasUpper" : "clblasLower") << ";" << std::endl; ss << indent() << "const clblasTranspose transA = "; switch (step->kargs().transA) { case clblasNoTrans: ss << "clblasNoTrans"; break; case clblasTrans: ss << "clblasTrans"; break; case clblasConjTrans: ss << "clblasConjTrans"; break; } ss << ";" << std::endl; ss << indent() << "const clblasTranspose transB = "; switch (step->kargs().transB) { case clblasNoTrans: ss << "clblasNoTrans"; break; case clblasTrans: ss << "clblasTrans"; break; case clblasConjTrans: ss << "clblasConjTrans"; break; } ss << ";" << std::endl; ss << indent() << "const clblasDiag diag = " << ((step->kargs().diag == clblasUnit) ? 
"clblasUnit" : "clblasNonUnit") << ";" << std::endl; } void KTest::declarePatternVars(std::stringstream& ss, Step *step) { VarList vars = step->vars(); ArrayVarList var_arays = step->arrays(); vars.insert(vars.end(), var_arays.begin(), var_arays.end()); ss << std::endl; for (VarList::const_iterator it = vars.begin(); it != vars.end(); ++it) { Variable *var = *it; if (step != masterStep_ && var->isBuffer()) { // master step buffers are used continue; } ss << indent(); if (var->constant()) { ss << "const "; } ss << var->type() << " " << var->name(); if (!var->defaultValue().empty()) { ss << " = " << var->defaultValue(); } ss << ";" << std::endl; } } void KTest::generateMain(std::stringstream& ss, bool withAccuracy) { ArrayVarList list; std::map kargMap = masterStep_->kargMap(); std::string size; ss << std::endl; ss << indent() << "int" << std::endl; if (useSeveralKernels_) { ss << indent() << "main(int argc, char *argv[])" << std::endl; } else { ss << indent() << "main(void)" << std::endl; } ss << indent() << "{" << std::endl; indent_ += 4; ss << std::endl << indent() << "char *source;" << std::endl << indent() << "cl_ulong start, end;" << std::endl; ss << std::endl << indent() << "srand((unsigned int)time(NULL));" << std::endl; mainInit(ss); ss << std::endl; list = masterStep_->arrays(); for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) { ss << indent() << (*it)->name() << " = (" <<(*it)->type() << ")calloc("; if ((*it)->isMatrix()) { ss << masterStep_->matrixSize((MatrixVariable*)(*it)); } else { ss << masterStep_->vectorSize((VectorVariable*)(*it)); } ss << ", " << "sizeof(*" << (*it)->name() << "));" << std::endl; ss << indent() << "assert(" << (*it)->name() << " != NULL);" << std::endl; if ((*it)->copyOf() != NULL) { continue; } if ((*it)->isMatrix()) { MatrixVariable *var = (MatrixVariable*)(*it); ss << indent() << matrixGenName(matrixGen_) << "(order, " << var->rows()->name() << ", " << var->columns()->name() << ", " << var->matrixPointer() << ", " << var->ld()->name() << ");" << std::endl; } else { VectorVariable *var = (VectorVariable*)(*it); ss << indent() << vectorGenName(matrixGen_) << "(" << var->nElems()->name() << ", " << var->vectorPointer() << ", " << var->inc()->name() << ");" << std::endl; } } ss << indent() << masterStep_->postRandomCall() << ";" << std::endl; for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) { if ((*it)->copyOf() == NULL) { continue; } ss << indent() << "memcpy(" << (*it)->name() << ", " << (*it)->copyOf()->name() << ", ("; if ((*it)->isMatrix()) { ss << masterStep_->matrixSize((MatrixVariable*)(*it)); } else { ss << masterStep_->vectorSize((VectorVariable*)(*it)); } ss << ") * sizeof(*" << (*it)->copyOf()->name() << "));" << std::endl; } if (withAccuracy) { ss << std::endl << indent() << "NaiveBlas::" << masterStep_->naiveCall() << ";" << std::endl; } allocateWriteBuffers(ss); if (useSeveralKernels_) { for (unsigned int i = 0; i < steps_->size(); i++) { Step *step = (*steps_)[i]; ss << indent() << "{" << std::endl; indent_ += 4; declareGranulation(ss, step); ss << indent() << "const char* kernelName = argc > " << i + 1 << " ? 
argv[" << i + 1 << "] : \"" << step->kernelName() << "\";" << std::endl; ss << std::endl << indent() << "source = loadFile(kernelName);" << std::endl << indent() << "assert(source != NULL);" << std::endl; buildKernel(ss); declareBlasOptions(ss, step); declarePatternVars(ss, step); setKernelArgs(ss, step); ss << std::endl << indent() << "start = 0;" << std::endl << indent() << "end = 0;" << std::endl; execKernel(ss); ss << std::endl << indent() << "printExecTime(end - start);" << std::endl; indent_ -= 4; ss << indent() << "}" << std::endl; } } else { declareGranulation(ss, masterStep_); ss << std::endl << indent() << "source = loadFile(KERNEL_SOURCE);" << std::endl << indent() << "assert(source != NULL);" << std::endl; buildKernel(ss); setKernelArgs(ss, masterStep_); ss << std::endl << indent() << "start = 0;" << std::endl << indent() << "end = 0;" << std::endl; execKernel(ss); ss << std::endl << indent() << "printExecTime(end - start);" << std::endl; } if (withAccuracy) { readBuffers(ss); ss << std::endl << indent() << "if (" << masterStep_->compareCall() << ") {" << std::endl << indent() << " printf(\"Correctness test passed\\n\");" << std::endl << indent() << "}" << std::endl << indent() << "else {" << std::endl << indent() << " printf(\"Correctness test failed\\n\");" << std::endl << indent() << "}" << std::endl << indent() << "fflush(stdout);" << std::endl; } mainFinish(ss); ss << std::endl; list = masterStep_->arrays(); for (ArrayVarList::const_iterator it = list.begin(); it != list.end(); ++it) { ss << indent() << "free(" << (*it)->name() << ");" << std::endl; } ss << indent() << "free(source);" << std::endl << indent() << "exit(EXIT_SUCCESS);" << std::endl; indent_ -= 4; ss << indent() << "}" << std::endl; } void KTest::loadFile(std::stringstream& ss) { ss << loadFileCode << std::endl; } void KTest::randomVector(std::stringstream& ss) { ss << randomVectorCode << std::endl; } void KTest::unitVector(std::stringstream& ss) { ss << unitVectorCode << std::endl; } void KTest::sawtoothVector(std::stringstream& ss) { ss << sawtoothVectorCode << std::endl; } void KTest::compareVectors(std::stringstream& ss) { ss << compareVectorsCode << std::endl; } void KTest::randomMatrix(std::stringstream& ss) { ss << randomMatrixCode << std::endl; } void KTest::unitMatrix(std::stringstream& ss) { ss << unitMatrixCode << std::endl; } void KTest::sawtoothMatrix(std::stringstream& ss) { ss << sawtoothMatrixCode << std::endl; } void KTest::setUpTRSMDiagonal(std::stringstream& ss) { ss << setUpTRSMDiagonalCode << std::endl; } void KTest::compareMatrices(std::stringstream& ss) { ss << compareMatricesCode << std::endl; } void KTest::includes(std::stringstream& ss) { ss << std::endl << indent() << "#include " << std::endl; } void KTest::forwardDeclarations(std::stringstream& ss) { ss << forwardDeclarationsCode << std::endl; } void KTest::auxFunctions(std::stringstream& ss) { getPlatform(ss); getDevice(ss); createKernel(ss); printExecTime(ss); } void KTest::getPlatform(std::stringstream& ss) { ss << getPlatformCode << std::endl; } void KTest::getDevice(std::stringstream& ss) { ss << getDeviceCode << std::endl; } void KTest::createKernel(std::stringstream& ss) { ss << createKernelCode << std::endl; } void KTest::printExecTime(std::stringstream& ss) { ss << printTimeCode; } void KTest::declareGranulation(std::stringstream& ss, Step *step) { ss << std::endl; ss << indent() << "const cl_uint workDim = " << step->pgran().wgDim << ";" << std::endl; ss << indent() << "const size_t localWorkSize[" << step->pgran().wgDim 
<< "] = { "; for (unsigned int i = 0; i < step->pgran().wgDim; i++) { if (i != 0) { ss << ", "; } ss << step->pgran().wgSize[i]; } ss << " };" << std::endl; ss << indent() << "const size_t globalWorkSize[" << step->pgran().wgDim << "] = { " << step->globalWorkSize() << " };" << std::endl; } void KTest::mainInit(std::stringstream& ss) { ss << std::endl << indent() << "cl_int err;" << std::endl << indent() << "cl_platform_id platform;" << std::endl << indent() << "cl_device_id device;" << std::endl << indent() << "cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };" << std::endl << indent() << "cl_context context;" << std::endl << indent() << "cl_command_queue queue;" << std::endl << indent() << "cl_kernel kernel;" << std::endl << indent() << "cl_event event;" << std::endl; ss << std::endl << indent() << "platform = getPlatform(PLATFORM_NAME);" << std::endl << indent() << "assert(platform != NULL);" << std::endl << indent() << "device = getDevice(platform, DEVICE_NAME);" << std::endl << indent() << "assert(device != NULL);" << std::endl << indent() << "props[1] = (cl_context_properties)platform;" << std::endl << indent() << "context = clCreateContext(props, 1, &device, NULL, NULL, &err);" << std::endl << indent() << "assert(context != NULL);" << std::endl << indent() << "queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err);" << std::endl << indent() << "assert(queue != NULL);" << std::endl; } void KTest::buildKernel(std::stringstream& ss) { ss << indent() << "kernel = createKernel(source, context, BUILD_OPTIONS, &err);" << std::endl << indent() << "assert(kernel != NULL);" << std::endl; } void KTest::getBufferSizeExpr(Variable *buffer, std::string& size) { ArrayVariableInterface *hostPtr = (ArrayVariableInterface*)(buffer->hostPtr()); if (hostPtr->isMatrix()) { MatrixVariable *ptrVar = (MatrixVariable*)hostPtr; if (masterStep_->matrixSize(ptrVar).empty()) { size += "sizeof("; size += ptrVar->type(); size += ")"; } else { size = "("; size += masterStep_->matrixSize(ptrVar); size += ") * sizeof(*"; size += ptrVar->name(); size += ")"; } } else { VectorVariable *ptrVar = (VectorVariable*)buffer->hostPtr(); size = "("; size += masterStep_->vectorSize(ptrVar); size += ") * sizeof(*"; size += ptrVar->name(); size += ")"; } } void KTest::allocateWriteBuffers(std::stringstream& ss) { VarList list; std::string size; ss << std::endl; list = masterStep_->buffers(); for (VarList::const_iterator it = list.begin(); it != list.end(); ++it) { getBufferSizeExpr(*it, size); ss << indent() << (*it)->name() << " = clCreateBuffer(context, " << (*it)->flagsStr() << "," << std::endl << indent() << " " << size << ", NULL, &err);" << std::endl; ss << indent() << "assert(" << (*it)->name() << " != NULL);" << std::endl; if (((*it)->flags() & CL_MEM_READ_WRITE) || ((*it)->flags() & CL_MEM_READ_ONLY)) { ss << indent() << "err = clEnqueueWriteBuffer(queue, " << (*it)->name() << ", CL_TRUE, 0," << std::endl << indent() << " " << size << ", " << ((Variable*)(*it)->hostPtr())->name() << "," << std::endl << indent() << " 0, NULL, NULL);" << std::endl; ss << indent() << "assert(err == CL_SUCCESS);" << std::endl; } } } void KTest::setKernelArgs(std::stringstream& ss, Step *step) { std::map kargMap = step->kargMap(); ss << std::endl; for (KArgMap::iterator it = kargMap.begin(); it != kargMap.end(); ++it) { ss << indent() << "err = clSetKernelArg(kernel, " << (*it).first << ", sizeof(" << (*it).second->type() << "), " << "&" << (*it).second->name() << ");" << std::endl; ss << indent() << 
"assert(err == CL_SUCCESS);" << std::endl; } } void KTest::execKernel(std::stringstream& ss) { ss << std::endl << indent() << "event = NULL;" << std::endl << indent() << "err = clEnqueueNDRangeKernel(queue, kernel, workDim, NULL," << std::endl << indent() << " globalWorkSize, localWorkSize, 0, NULL, &event);" << std::endl << indent() << "assert(err == CL_SUCCESS);" << std::endl << indent() << "err = clFinish(queue);" << std::endl << indent() << "assert(err == CL_SUCCESS);" << std::endl; ss << std::endl << indent() << "err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START," << std::endl << indent() << " sizeof(start), &start, NULL);" << std::endl << indent() << "err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END," << std::endl << indent() << " sizeof(end), &end, NULL);" << std::endl; } void KTest::readBuffers(std::stringstream& ss) { VarList list; std::string size; ss << std::endl; list = masterStep_->buffers(); for (VarList::const_iterator it = list.begin(); it != list.end(); ++it) { if (((*it)->flags() & CL_MEM_READ_WRITE) || ((*it)->flags() & CL_MEM_WRITE_ONLY)) { getBufferSizeExpr(*it, size); ss << indent() << "err = clEnqueueReadBuffer(queue, " << (*it)->name() << ", CL_TRUE, 0," << std::endl << indent() << " " << size << ", " << ((Variable*)(*it)->hostPtr())->name() << "," << std::endl << indent() << " 0, NULL, NULL);" << std::endl; ss << indent() << "assert(err == CL_SUCCESS);" << std::endl; } } } void KTest::mainFinish(std::stringstream& ss) { VarList list; ss << std::endl; list = masterStep_->buffers(); for (VarList::const_iterator it = list.begin(); it != list.end(); ++it) { ss << indent() << "err = clReleaseMemObject(" << (*it)->name() << ");" << std::endl; ss << indent() << "assert(err == CL_SUCCESS);" << std::endl; } ss << indent() << "err = clReleaseKernel(kernel);" << std::endl << indent() << "assert(err == CL_SUCCESS);" << std::endl << indent() << "err = clReleaseCommandQueue(queue);" << std::endl << indent() << "assert(err == CL_SUCCESS);" << std::endl << indent() << "err = clReleaseContext(context);" << std::endl << indent() << "assert(err == CL_SUCCESS);" << std::endl; } clblas-2.10/src/library/tools/ktest/ktest.h000066400000000000000000000063621264277366700207570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_KTEST_H__ #define KTEST_KTEST_H__ #include #include #include "ktest-common.h" #include "step.h" #include "config.h" namespace clMath { /** * @internal * @brief Host code generation class * * Object of this class generate host-side source file that can execute kernels * for one or several steps. 
* */ class KTest { private: std::string platform_; std::string device_; std::string kernelSourceFile_; std::string buildOptions_; KTestMatrixGenerator matrixGen_; Step *masterStep_; std::vector *steps_; size_t indent_; bool useSeveralKernels_; const char* matrixGenName(KTestMatrixGenerator gen); const char* vectorGenName(KTestMatrixGenerator gen); void typedefs(std::stringstream& ss); void declareKTestOptions(std::stringstream& ss); void declareBlasOptions(std::stringstream& ss, Step *step); void declarePatternVars(std::stringstream& ss, Step *step); void generateMain(std::stringstream& ss, bool withAccuracy); void loadFile(std::stringstream& ss); void randomVector(std::stringstream& ss); void unitVector(std::stringstream& ss); void sawtoothVector(std::stringstream& ss); void compareVectors(std::stringstream& ss); void randomMatrix(std::stringstream& ss); void unitMatrix(std::stringstream& ss); void sawtoothMatrix(std::stringstream& ss); void setUpTRSMDiagonal(std::stringstream& ss); void compareMatrices(std::stringstream& ss); std::string indent(); void includes(std::stringstream& ss); void forwardDeclarations(std::stringstream& ss); void declareGranulation(std::stringstream& ss, Step *step); void mainInit(std::stringstream& ss); void buildKernel(std::stringstream& ss); void allocateWriteBuffers(std::stringstream& ss); void setKernelArgs(std::stringstream& ss, Step *step); void execKernel(std::stringstream& ss); void readBuffers(std::stringstream& ss); void mainFinish(std::stringstream& ss); void auxFunctions(std::stringstream& ss); void getPlatform(std::stringstream& ss); void getDevice(std::stringstream& ss); void createKernel(std::stringstream& ss); void printExecTime(std::stringstream& ss); void getBufferSizeExpr(Variable *buffer, std::string& size); public: KTest(Step *masterStep, clMath::Config *cfg); KTest(Step *masterStep, std::vector *steps, clMath::Config *cfg); std::string generate(bool withAccuracy); }; } // namespace clMath #endif // KTEST_KTEST_H__ clblas-2.10/src/library/tools/ktest/main.cpp000066400000000000000000000210101264277366700210670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #define __CL_ENABLE_EXCEPTIONS #include #include #include #include #include #include "config.h" #include "step.h" #include "ktest.h" #include "steps/gemv.h" #include "steps/symv.h" #include "steps/gemm.h" #include "steps/trmm.h" #include "steps/trsm.h" #include "steps/syrk.h" #include "steps/syr2k.h" #include #include clMath::Step* getMasterStep( BlasFunctionID funcID, std::string platformName, std::string deviceName); clMath::Step* getStep(ListNode *node); void destroyPatterns(std::vector& patterns); cl_platform_id getPlatform(const char *name); cl_device_id getDevice( cl_platform_id platform, const char *name); int main(int argc, char *argv[]) { clMath::Config cfg; cfg.setDefaultConfig("ktest.cfg"); if (!cfg.parseCommandLine(argc, argv) || !cfg.isSane()) { return 1; } clblasSetup(); parseEnvImplementation(); clMath::Step *masterStep = getMasterStep(cfg.blasFunctionID(), cfg.platform(), cfg.device()); if (masterStep == NULL) { std::cerr << "Function support not implemented yet" << std::endl; return 1; } CLBlasKargs kargs; SubproblemDim subdims[MAX_SUBDIMS]; cfg.kargs(&kargs); masterStep->setKargs(kargs); masterStep->fixLD(); ListHead seq; listInitHead(&seq); bool severalKernels = false; /* Single kernel for this function */ if (cfg.decomposition(subdims)) { masterStep->setDecomposition(subdims); } masterStep->completeDecompositionSingle(); if (cfg.permitMultiKernels()) { masterStep->makeSolutionSequence(&seq, getPlatform(cfg.platform().c_str())); if (listLength(&seq) > 1) { severalKernels = true; } } if (severalKernels) { std::ofstream fs; ListNode *node; std::vector steps; masterStep->declareVars(NULL); for (node = listNodeFirst(&seq); node != &seq; node = node->next) { steps.push_back(getStep(node)); } std::string str; for (unsigned int i = 0; i < steps.size(); i++) { std::stringstream kernelFileName; kernelFileName << i << "_" << steps[i]->getBlasFunctionName() << "_" << cfg.cl(); steps[i]->setKernelName(kernelFileName.str()); if (cfg.decomposition(subdims)) { steps[i]->setDecomposition(subdims); } steps[i]->completeDecompositionSingle(); steps[i]->declareVars(masterStep); std::cout << "Generating '" << steps[i]->kernelName() << "' ..." << std::endl; str = steps[i]->generate(); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(kernelFileName.str().c_str()); fs << str; fs.close(); } clMath::KTest *ktest = new clMath::KTest(masterStep, &steps, &cfg); std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl; str = ktest->generate(cfg.withAccuracy()); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cpp().c_str()); fs << str; fs.close(); delete ktest; for (std::vector::iterator it = steps.begin(); it != steps.end(); ++it) { delete (*it); } steps.clear(); } else { std::ofstream fs; masterStep->setKernelName(cfg.cl()); std::cout << "Generating '" << masterStep->kernelName() << "' ..." << std::endl; masterStep->declareVars(NULL); std::string str; str = masterStep->generate(); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cl().c_str()); fs << str; fs.close(); clMath::KTest *ktest = new clMath::KTest(masterStep, &cfg); std::cout << "Generating '" << cfg.cpp() << "' ..." 
<< std::endl; str = ktest->generate(cfg.withAccuracy()); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cpp().c_str()); fs << str; fs.close(); delete ktest; } if (cfg.permitMultiKernels()) { masterStep->freeSolutionSequence(&seq); } delete masterStep; return 0; } clMath::Step* getMasterStep( BlasFunctionID funcID, std::string platformName, std::string deviceName) { cl_platform_id platformID; cl_device_id deviceID; platformID = getPlatform(platformName.c_str()); deviceID = getDevice(platformID, deviceName.c_str()); switch (funcID) { case CLBLAS_GEMV: return new clMath::GemvStep(deviceID); case CLBLAS_SYMV: return new clMath::SymvStep(deviceID); case CLBLAS_GEMM: return new clMath::GemmStep(deviceID); case CLBLAS_TRMM: return new clMath::TrmmStep(deviceID); case CLBLAS_TRSM: return new clMath::TrsmStep(deviceID); case CLBLAS_SYRK: return new clMath::SyrkStep(deviceID); case CLBLAS_SYR2K: return new clMath::Syr2kStep(deviceID); default: return NULL; } } clMath::Step* getStep(ListNode *node) { switch (clMath::Step::getStepNodeFuncID(node)) { case CLBLAS_GEMV: return new clMath::GemvStep(node); case CLBLAS_SYMV: return new clMath::SymvStep(node); case CLBLAS_GEMM: return new clMath::GemmStep(node); case CLBLAS_TRMM: return new clMath::TrmmStep(node); case CLBLAS_TRSM: return new clMath::TrsmStep(node); case CLBLAS_SYRK: return new clMath::SyrkStep(node); case CLBLAS_SYR2K: return new clMath::Syr2kStep(node); default: return NULL; } } cl_platform_id getPlatform(const char *name) { cl_int err; cl_uint nrPlatforms, i; cl_platform_id *list, platform; char platformName[64]; err = clGetPlatformIDs(0, NULL, &nrPlatforms); if (err != CL_SUCCESS) { return NULL; } list = (cl_platform_id*)calloc(nrPlatforms, sizeof(*list)); if (list == NULL) { return NULL; } err = clGetPlatformIDs(nrPlatforms, list, NULL); if (err != CL_SUCCESS) { free(list); return NULL; } platform = NULL; for (i = 0; i < nrPlatforms; i++) { err = clGetPlatformInfo(list[i], CL_PLATFORM_NAME, sizeof(platformName), platformName, NULL); if ((err == CL_SUCCESS) && (strcmp(platformName, name) == 0)) { platform = list[i]; break; } } free(list); return platform; } cl_device_id getDevice( cl_platform_id platform, const char *name) { cl_int err; cl_uint nrDevices, i; cl_device_id *list, device; char deviceName[64]; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &nrDevices); if (err != CL_SUCCESS) { return NULL; } list = (cl_device_id*)calloc(nrDevices, sizeof(*list)); if (list == NULL) { return NULL; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, nrDevices, list, NULL); if (err != CL_SUCCESS) { free(list); return NULL; } device = NULL; for (i = 0; i < nrDevices; i++) { err = clGetDeviceInfo(list[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL); if ((err == CL_SUCCESS) && (strcmp(deviceName, name) == 0)) { device = list[i]; break; } } free(list); return device; } clblas-2.10/src/library/tools/ktest/naive/000077500000000000000000000000001264277366700205475ustar00rootroot00000000000000clblas-2.10/src/library/tools/ktest/naive/naive_blas.cpp000066400000000000000000000434521264277366700233660ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #if defined (_MSC_VER) #define __template_static static #define isnan(x) _isnan((x)) #pragma warning( disable : 4290 ) #else /* _MSC_VER */ #define __template_static #endif /* !_MSC_VER */ namespace NaiveBlas { /* Problem flags */ typedef enum clblasOrder { clblasRowMajor, clblasColumnMajor } clblasOrder; typedef enum clblasTranspose { clblasNoTrans, clblasTrans, clblasConjTrans } clblasTranspose; typedef enum clblasUplo { clblasUpper, clblasLower } clblasUplo; typedef enum clblasDiag { clblasUnit, clblasNonUnit } clblasDiag; typedef enum clblasSide { clblasLeft, clblasRight } clblasSide; /* Complex types and related manipulations */ typedef cl_float2 FloatComplex; typedef cl_double2 DoubleComplex; static __inline FloatComplex floatComplex(float real, float imag) { FloatComplex z; z.s[0] = real; z.s[1] = imag; return z; } static __inline DoubleComplex doubleComplex(double real, double imag) { DoubleComplex z; z.s[0] = real; z.s[1] = imag; return z; } #define CREAL(v) ((v).s[0]) #define CIMAG(v) ((v).s[1]) // Type-dependent constants template static T ZERO() { return static_cast(0.0); } template<> __template_static FloatComplex ZERO() { return floatComplex(0.0, 0.0); } template<> __template_static DoubleComplex ZERO() { return doubleComplex(0.0, 0.0); } template static T ONE() { return static_cast(1.0); } template<> __template_static FloatComplex ONE() { return floatComplex(1.0, 0.0); } template<> __template_static DoubleComplex ONE() { return doubleComplex(1.0, 0.0); } template static T TWO() { return static_cast(2.0); } template<> __template_static FloatComplex TWO() { return floatComplex(2.0, 0.0); } template<> __template_static DoubleComplex TWO() { return doubleComplex(2.0, 0.0); } template static bool isNAN(T x) { return (isnan(x) != 0); } template<> __template_static bool isNAN(FloatComplex x) { return (isNAN(CREAL(x)) && isNAN(CIMAG(x))); } template<> __template_static bool isNAN(DoubleComplex x) { return (isNAN(CREAL(x)) && isNAN(CIMAG(x))); } /* Type-dependent random() */ template static T random(cl_double limit) { T v; cl_ulong l = static_cast(limit); if (l == 0) { return 0; } v = static_cast(rand() % l); if ((rand() % 2) == 1) v = -v; return v; } template static T random(cl_double left, cl_double right) { T v; T l = static_cast(left); v = random(right - left); if (v < 0) { v -= l; } else { v += l; } return v; } template static T random() { return random(static_cast(10)); } template<> __template_static FloatComplex random() { return floatComplex(random(), random()); } template<> __template_static FloatComplex random(cl_double limit) { return floatComplex(random(limit), random(limit)); } template<> __template_static FloatComplex random(cl_double left, cl_double right) { return floatComplex(random(left, right), random(left, right)); } template<> __template_static DoubleComplex random() { return doubleComplex(random(), random()); } template<> __template_static DoubleComplex random(cl_double limit) { return doubleComplex(random(limit), random(limit)); } template<> __template_static DoubleComplex 
random(cl_double left, cl_double right) { return doubleComplex(random(left, right), random(left, right)); } /* Boolean operators */ template static bool operator==(T a, T b) { return (a == b); } template<> __template_static bool operator==(FloatComplex a, FloatComplex b) { return ((CREAL(a) == CREAL(b)) && (CIMAG(a) == CIMAG(b))); } template<> __template_static bool operator==(DoubleComplex a, DoubleComplex b) { return ((CREAL(a) == CREAL(b)) && (CIMAG(a) == CIMAG(b))); } template static bool operator!=(T a, T b) { return !(a == b); } /* math operators */ static __inline float conjugate(float elem) { return elem; } static __inline double conjugate(double elem) { return elem; } static __inline FloatComplex conjugate(FloatComplex elem) { return floatComplex(CREAL(elem), -CIMAG(elem)); } static __inline DoubleComplex conjugate(DoubleComplex elem) { return doubleComplex(CREAL(elem), -CIMAG(elem)); } static __inline FloatComplex operator+(FloatComplex a, FloatComplex b) { return floatComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b)); } static __inline FloatComplex operator-(FloatComplex a, FloatComplex b) { return floatComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b)); } static __inline FloatComplex operator*(FloatComplex a, FloatComplex b) { return floatComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static __inline FloatComplex operator*(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) * b, CIMAG(a) * b); } static __inline FloatComplex operator/(FloatComplex a, FloatComplex b) { cl_float div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return floatComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static __inline FloatComplex operator/(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) / b, CIMAG(a) / b); } static __inline DoubleComplex operator+(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) + CREAL(b), CIMAG(a) + CIMAG(b)); } static __inline DoubleComplex operator-(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) - CREAL(b), CIMAG(a) - CIMAG(b)); } static __inline DoubleComplex operator*(DoubleComplex a, DoubleComplex b) { return doubleComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static __inline DoubleComplex operator*(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) * b, CIMAG(a) * b); } static __inline DoubleComplex operator/(DoubleComplex a, DoubleComplex b) { cl_double div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return doubleComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static __inline DoubleComplex operator/(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) / b, CIMAG(a) / b); } cl_int module(cl_int a) { return abs(a); } cl_float module(cl_float a) { return fabsf(a); } cl_double module(cl_double a) { return fabs(a); } cl_float module(FloatComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrtf(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } cl_double module(DoubleComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrt(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } #define FLOAT_UPPER_BOUND pow(2.0, 23) #define DOUBLE_UPPER_BOUND pow(2.0, 52) // Type-dependant constants template static cl_double UPPER_BOUND(); template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static 
cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND;} template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND; } /* Provide simple access to vector elements */ template class VectorAccessor { public: VectorAccessor( ElemType *vector, size_t len, IncType inc, bool conj=false) : vector_(vector), inc_(inc), len_(len), conj_(conj) { /* do nothing */ } ElemType& operator [] (size_t idx) throw (std::string) { ElemType *el; if (idx >= len_) { throw std::string("Trying to access vector beyond boundary!"); } if (inc_ > 0) { el = vector_ + idx * inc_; } else { el = vector_ + (len_ - idx - 1) * (-inc_); } if (conj_) { tmp_ = conjugate(*el); return tmp_; } else { return *el; } } private: ElemType *vector_; ElemType tmp_; IncType inc_; size_t len_; bool conj_; }; /* Mapping between logical and physical matrix layout */ template class MatrixAccessor { public: MatrixAccessor( T *matrix, clblasOrder order, clblasTranspose trans, size_t nrRows, size_t nrCols, size_t ld) : matrix_(matrix), nrRows_(nrRows), nrCols_(nrCols), ld_(ld) { conj_ = (trans == clblasConjTrans); if ((order == clblasColumnMajor && trans == clblasNoTrans) || (order == clblasRowMajor && trans != clblasNoTrans)) { tra_ = true; } else { tra_ = false; } } void flipTransposing(void) { tra_ = !tra_; } VectorAccessor operator [] (size_t row) const throw (std::string) { T *vector; size_t inc; if (row >= nrRows_) { throw std::string("Trying to access matrix beyond boundary!"); } if (tra_) { vector = matrix_ + row; inc = ld_; } else { vector = matrix_ + row * ld_; inc = 1; } return VectorAccessor(vector, nrCols_, inc, conj_); } private: T *matrix_; bool tra_; bool conj_; size_t nrRows_; size_t nrCols_; size_t ld_; }; template __template_static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, T alpha, const T *A, size_t lda, const T *B, size_t ldb, T beta, T *C, size_t ldc) { MatrixAccessor ma(const_cast(A), order, transA, M, K, lda); MatrixAccessor mb(const_cast(B), order, transB, K, N, ldb); MatrixAccessor mc(C, order, clblasNoTrans, M, N, ldc); size_t i, j, k; T tmp; for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * mb[k][j]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T alpha, const T *A, size_t lda, T *B, size_t ldb) { size_t i, j, k; size_t row, col; size_t rowsA = (side == clblasLeft) ? M : N; size_t colsB = (side == clblasLeft) ? N : M; MatrixAccessor ma(const_cast(A), order, transA, rowsA, rowsA, lda); MatrixAccessor mb(B, order, clblasNoTrans, rowsA, colsB, ldb); T tmp, a; bool revPass; revPass = (uplo == clblasLower) ^ (transA != clblasNoTrans); if (side == clblasRight) { ma.flipTransposing(); mb.flipTransposing(); revPass = !revPass; } for (i = 0; i < rowsA; i++) { row = (revPass) ? (rowsA - i - 1) : i; for (j = 0; j < colsB; j++) { size_t boundK = (revPass) ? row : (rowsA - row - 1); tmp = ZERO(); for (k = 0; k <= boundK; k++) { col = (revPass) ? 
k : (rowsA - k - 1); if ((k == boundK) && (diag == clblasUnit)) { a = ONE(); } else { a = ma[row][col]; } tmp = tmp + a * mb[col][j]; } mb[row][j] = tmp * alpha; } } } template __template_static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T alpha, const T *A, size_t lda, T *B, size_t ldb) { size_t i, j, k; size_t row, col; size_t rowsA = (side == clblasLeft) ? M : N; size_t colsB = (side == clblasLeft) ? N : M; MatrixAccessor ma(const_cast(A), order, transA, rowsA, rowsA, lda); MatrixAccessor mb(B, order, clblasNoTrans, rowsA, colsB, ldb); T tmp, a; bool revPass; revPass = (uplo == clblasUpper) ^ (transA != clblasNoTrans); if (side == clblasRight) { ma.flipTransposing(); mb.flipTransposing(); revPass = !revPass; } for (i = 0; i < rowsA; i++) { row = (revPass) ? (rowsA - i - 1) : i; for (j = 0; j < colsB; j++) { size_t boundK = (revPass) ? (rowsA - row - 1) : row; tmp = ZERO(); for (k = 0; k <= boundK; k++) { col = (revPass) ? (rowsA - k - 1) : k; if (col == row) { a = (diag == clblasUnit) ? ONE() : ma[row][col]; tmp = (mb[row][j] - tmp) / a; } else { tmp = tmp + ma[row][col] * mb[col][j]; } } mb[row][j] = tmp; } } for (i = 0; i < rowsA; i++) { for (j = 0; j < colsB; j++) { mb[i][j] = mb[i][j] * alpha; } } } template __template_static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, T alpha, const T *A, size_t lda, T beta, T *C, size_t ldc) { size_t i, j, k; clblasTranspose tr = trans == clblasNoTrans ? clblasNoTrans : clblasTrans; MatrixAccessor ma(const_cast(A), order, tr, N, K, lda); MatrixAccessor mc(C, order, clblasNoTrans, N, N, ldc); T tmp; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { if ((uplo == clblasLower && j > i) || (uplo == clblasUpper && i > j)) { continue; } tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * ma[j][k]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose trans, size_t N, size_t K, T alpha, const T *A, size_t lda, const T *B, size_t ldb, T beta, T *C, size_t ldc) { size_t i, j, k; clblasTranspose tr = trans == clblasNoTrans ? 
clblasNoTrans : clblasTrans; MatrixAccessor ma(const_cast(A), order, tr, N, K, lda); MatrixAccessor mb(const_cast(B), order, tr, N, K, ldb); MatrixAccessor mc(C, order, clblasNoTrans, N, N, ldc); T tmp; for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { if ((uplo == clblasLower && j > i) || (uplo == clblasUpper && i > j)) { continue; } tmp = ZERO(); for (k = 0; k < K; k++) { tmp = tmp + ma[i][k] * mb[j][k] + ma[j][k] * mb[i][k]; } mc[i][j] = mc[i][j] * beta + tmp * alpha; } } } template __template_static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, T alpha, const T *A, size_t lda, const T *X, int incx, T beta, T *Y, int incy) { size_t sizeX, sizeY; size_t m, n; T tmp; if(transA == clblasNoTrans) { sizeX = N; sizeY = M; } else { sizeX = M; sizeY = N; } MatrixAccessor ma(const_cast(A), order, transA, sizeY, sizeX, lda); VectorAccessor vx(const_cast(X), sizeX, incx); VectorAccessor vy(const_cast(Y), sizeY, incy); for (m = 0; m < sizeY; m++) { tmp = ZERO(); for (n = 0; n < sizeX; n++) { tmp = tmp + ma[m][n] * vx[n]; } vy[m] = tmp * alpha + vy[m] * beta; } } template __template_static void symv( clblasOrder order, clblasUplo uplo, size_t N, T alpha, const T *A, size_t lda, const T *X, int incx, T beta, T *Y, int incy) { size_t m, n; T tmp; MatrixAccessor ma(const_cast(A), order, clblasNoTrans, N, N, lda); VectorAccessor vx(const_cast(X), N, incx); VectorAccessor vy(const_cast(Y), N, incy); for (m = 0; m < N; m++) { tmp = ZERO(); for (n = 0; n < N; n++) { if (((uplo == clblasUpper) && (m <= n)) || ((uplo == clblasLower) && (m >= n))) { tmp = tmp + ma[m][n] * vx[n]; } else { tmp = tmp + ma[n][m] * vx[n]; } } vy[m] = tmp * alpha + vy[m] * beta; } } } /* NaiveBlas namespace */ clblas-2.10/src/library/tools/ktest/scripts/000077500000000000000000000000001264277366700211345ustar00rootroot00000000000000clblas-2.10/src/library/tools/ktest/scripts/verify_ktest.bash000066400000000000000000000130751264277366700245170ustar00rootroot00000000000000#!/bin/bash FUNCTIONS=(gemm trmm trsm syrk syr2k gemv symv) ALL_PRECISIONS=(s c) ALL_OPTIONS=(order transA transB side uplo diag M N K incx incy offA offBX offCY) # list of supported options for each function: gemm, trmm, trsm, syrk, ssyr2k, gemv, symv FUNC_OPTIONS=( "order transA transB M N K" "order transA side uplo diag M N" "order transA side uplo diag M N" "order transA uplo N K" "order transA uplo N K" "order transA M N" "order uplo N" ) # all options space: precision, order, transA, transB, side, uplo, unit, M, N, K ALL_OPTION_VALUES=( "row column" "n t c" "n t c" "left right" "upper lower" "unit nonunit" "15 16 64" "15 16 64" "15 16 64" "1" "1" "128" "256" "512" ) REPORT_FILE="ktest_report.dat" PREV_KERNEL="" REMAINING_OPTSTR= CMDLINE= FUNCTION_INDEX= forward_options_and_call_test() { local optidx=$1 local precision=$2 local optstr=${REMAINING_OPTSTR[@]} local ret=0 local stat=0 local cmdline= local msg= local err_msg= for opt in ${optstr[@]} do REMAINING_OPTSTR=${REMAINING_OPTSTR[@]##$opt} echo ${FUNC_OPTIONS[$FUNCTION_INDEX]} | grep $opt > /dev/null if [ $? -eq 0 ] then break fi let "optidx += 1" done # make test and call if no more options to forward, or go further in the option list if [ $optidx == ${#ALL_OPTIONS[@]} ] then cmdline="--function "$PRECISION${FUNCTIONS[$FUNCTION_INDEX]}" ${CMDLINE[@]}" echo ${cmdline[@]} ./make-ktest ${cmdline[@]} stat=$? err_msg="[ERROR]: make-ktest has failed!" 
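# At this point make-ktest has produced the OpenCL kernel source (*.cl) and the
# host-side test (ktest.cpp) for the current option set; the checks below only
# run if that generation step succeeded.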
if [ $stat -eq 0 ] then # check if the kernel is not the same as the last one kernel=`cat *.cl` if [ "${kernel[*]}" == "${LAST_KERNEL[*]}" ] then echo "Critical error, just the same kernel has been already generated!" return 1 fi fi if [ $stat -eq 0 ] then g++ -o test ktest.cpp -I$AMDAPPSDKROOT/include -lOpenCL stat=$? err_msg="[ERROR]: test compilation has failed!"= fi if [ $stat -eq 0 ] then msg=`./test 2>&1` stat=$? fi if [ $stat -eq 0 ] then time_msg=${msg/Correctness*/""} msg=${msg##$time_msg} echo $time_msg echo $msg echo $msg | grep "passed" > /dev/null stat=$? fi if [ $stat -ne 0 ] then echo $err_msg echo ${cmdline[@]} >> $REPORT_FILE.tmp fi else local OPTION=${ALL_OPTIONS[$optidx]} local OPTION_VALUES=${ALL_OPTION_VALUES[$optidx]} let "optidx += 1" cmdline=${CMDLINE[@]} for val in ${OPTION_VALUES[@]} do CMDLINE=${cmdline[@]}" --$OPTION ""$val" (forward_options_and_call_test $optidx) ret=$? if [ $ret -ne 0 ] then break fi LAST_KERNEL=$kernel rm -f *.cl > /dev/null done fi return $ret } rm -f *.cl > /dev/null > $REPORT_FILE.tmp # test the main funtional for ((i = 0; i < ${#FUNCTIONS[@]}; i++)) do FUNCTION_INDEX=$i for PRECISION in ${ALL_PRECISIONS[@]} do if [[ ${FUNCTIONS[$i]} == symv && $PRECISION == c ]] then continue fi CMDLINE="" REMAINING_OPTSTR=${ALL_OPTIONS[@]} forward_options_and_call_test 0 done done echo ========================================================================================== # test increment and offset arguments FUNC_OPTIONS=( "order transA transB M N K offA offCY" "order transA side uplo diag M N offA offBX" "order transA side uplo diag M N offA" "order transA uplo N K offA offCY" "order transA uplo N K offA offBX offCY" "order transA M N offA incx incy offA" "order uplo N offA incx incy offA" ) ALL_OPTION_VALUES=( "row column" "n" "n" "left" "upper" "nonunit" "64" "64" "64" "1 3 7" "1 5 9" "128" "256" "512" ) for ((i = 0; i < ${#FUNCTIONS[@]}; i++)) do FUNCTION_INDEX=$i for PRECISION in ${ALL_PRECISIONS[@]} do if [[ ${FUNCTIONS[$i]} == symv && $PRECISION == c ]] then continue fi CMDLINE="" REMAINING_OPTSTR=${ALL_OPTIONS[@]} forward_options_and_call_test 0 done done # complete the report report=`cat $REPORT_FILE.tmp` nr_fails=`cat $REPORT_FILE.tmp | wc -l` if [ $nr_fails == 0 ] then echo "All tests passed" > $REPORT_FILE else echo "Failed cases:" > $REPORT_FILE echo "-----------------------------------------------------------------" >> $REPORT_FILE cat $REPORT_FILE.tmp >> $REPORT_FILE echo "-----------------------------------------------------------------" >> $REPORT_FILE echo "Total number of failed cases: $nr_fails" >> $REPORT_FILE fi rm $REPORT_FILE.tmp clblas-2.10/src/library/tools/ktest/step-dump.cpp000066400000000000000000000227231264277366700220750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include "step.h" using namespace clMath; template struct FlagsDesc { T flag; const char *desc; }; static const struct FlagsDesc kernelExtraFlagsDesc[] = { { KEXTRA_TRANS_A, "KEXTRA_TRANS_A" }, { KEXTRA_CONJUGATE_A, "KEXTRA_CONJUGATE_A" }, { KEXTRA_TRANS_B, "KEXTRA_TRANS_B" }, { KEXTRA_CONJUGATE_B, "KEXTRA_CONJUGATE_B" }, { KEXTRA_COLUMN_MAJOR, "KEXTRA_COLUMN_MAJOR" }, { KEXTRA_UPPER_TRIANG, "KEXTRA_UPPER_TRIANG" }, { KEXTRA_SIDE_RIGHT, "KEXTRA_SIDE_RIGHT" }, { KEXTRA_UNIT_DIAGONAL, "KEXTRA_UNIT_DIAGONAL" }, { KEXTRA_TAILS_M, "KEXTRA_TAILS_M" }, { KEXTRA_TAILS_N, "KEXTRA_TAILS_N" }, { KEXTRA_TAILS_K, "KEXTRA_TAILS_K" }, { KEXTRA_BETA_ZERO, "KEXTRA_BETA_ZERO" }, { KEXTRA_NO_COPY_VEC_A, "KEXTRA_NO_COPY_VEC_A" }, { KEXTRA_NO_COPY_VEC_B, "KEXTRA_NO_COPY_VEC_B" }, { KEXTRA_NO_COPY_VEC_C, "KEXTRA_NO_COPY_VEC_C" }, { KEXTRA_SYRK_SEPARATE_DIAGONAL, "KEXTRA_SYRK_SEPARATE_DIAGONAL" }, { KEXTRA_SYRK_EVALUATE_DIAGONAL, "KEXTRA_SYRK_EVALUATE_DIAGONAL" }, { KEXTRA_SYRK_2K_RANK, "KEXTRA_SYRK_2K_RANK" }, { KEXTRA_INCX_ONE, "KEXTRA_INCX_ONE" }, { KEXTRA_INCY_ONE, "KEXTRA_INCY_ONE" }, { KEXTRA_ENABLE_MAD, "KEXTRA_ENABLE_MAD" }, { KEXTRA_VENDOR_AMD, "KEXTRA_VENDOR_AMD" }, { static_cast(0), NULL } }; static const struct FlagsDesc memLevelFlagsDesc[] = { { CLMEM_LEVEL_LDS, "CLMEM_LEVEL_LDS" }, { CLMEM_LEVEL_L1, "CLMEM_LEVEL_L1" }, { CLMEM_LEVEL_L2, "CLMEM_LEVEL_L2" }, { static_cast(0), NULL } }; template static void dumpFlags(std::stringstream& ss, T flags, const struct FlagsDesc *desc) { bool first = true; if (flags == static_cast(0)) { ss << "-"; return; } for (size_t i = 0; desc[i].desc != NULL; i++) { if (flags & desc[i].flag) { if (!first) { ss << " "; } ss << desc[i].desc; flags = static_cast(flags & ~desc[i].flag); first = false; } } if (flags != static_cast(0)) { if (!first) { ss << " "; } ss << flags; } } std::string Step::dtypeToString(DataType dtype) { switch (dtype) { case TYPE_FLOAT: return "cl_float"; case TYPE_DOUBLE: return "cl_double"; case TYPE_COMPLEX_FLOAT: return "FloatComplex"; case TYPE_COMPLEX_DOUBLE: return "DoubleComplex"; default: return ""; } } std::string Step::multiplierToString( DataType dtype, ArgMultiplier arg) { std::stringstream ss; switch (dtype) { case TYPE_FLOAT: ss << arg.argFloat; break; case TYPE_DOUBLE: ss << arg.argDouble; break; case TYPE_COMPLEX_FLOAT: ss << "floatComplex(" << arg.argFloatComplex.s[0] << ", " << arg.argFloatComplex.s[1] << ")"; break; case TYPE_COMPLEX_DOUBLE: ss << "doubleComplex(" << arg.argDoubleComplex.s[0] << ", " << arg.argDoubleComplex.s[1] << ")"; break; } return ss.str(); } std::string Step::dumpSubdim(const SubproblemDim *subdim) { std::stringstream ss; if (subdim == NULL) { return ss.str(); } ss << " x = "; if (subdim->x == SUBDIM_UNUSED) { ss << "SUBDIM_UNUSED"; } else { ss << subdim->x; } ss << std::endl; ss << " y = "; if (subdim->y == SUBDIM_UNUSED) { ss << "SUBDIM_UNUSED"; } else { ss << subdim->y; } ss << std::endl; ss << " bwidth = " << subdim->bwidth << std::endl; ss << " itemX = "; if (subdim->itemX == SUBDIM_UNUSED) { ss << "SUBDIM_UNUSED"; } else { ss << subdim->itemX; } ss << std::endl; ss << " itemY = "; if (subdim->itemY == SUBDIM_UNUSED) { ss << "SUBDIM_UNUSED"; } else { ss << subdim->itemY; } ss << std::endl; return ss.str(); } std::string Step::dumpPgran() { std::stringstream ss; const PGranularity *pgran = &step_.pgran; if (pgran == NULL) { return ss.str(); } ss << " wgDim = " << pgran->wgDim << std::endl; ss << " wgSize = ("; 
for (unsigned int i = 0; i < pgran->wgDim; i++) { if (i != 0) { ss << ", "; } ss << pgran->wgSize[i]; } ss << ")" << std::endl; ss << " wfSize = " << pgran->wfSize << std::endl; return ss.str(); } std::string Step::dumpKextra() { std::stringstream ss; const CLBLASKernExtra *kextra = &kextra_; if (kextra == NULL) { return ss.str(); } ss << " dtype = "; switch (kextra->dtype) { case TYPE_FLOAT: ss << "TYPE_FLOAT"; break; case TYPE_DOUBLE: ss << "TYPE_DOUBLE"; break; case TYPE_COMPLEX_FLOAT: ss << "TYPE_COMPLEX_FLOAT"; break; case TYPE_COMPLEX_DOUBLE: ss << "TYPE_COMPLEX_DOUBLE"; break; } ss << std::endl; ss << " flags = "; dumpFlags(ss, kextra->flags, kernelExtraFlagsDesc); ss << std::endl; ss << " kernType = "; switch (kextra->kernType) { case CLBLAS_COMPUTING_KERNEL: ss << "CLBLAS_COMPUTING_KERNEL"; break; case CLBLAS_PREP_A_KERNEL: ss << "CLBLAS_PREP_A_KERNEL"; break; case CLBLAS_PREP_B_KERNEL: ss << "CLBLAS_PREP_B_KERNEL"; break; default: ; // should not be reached } ss << std::endl; // Deprecated data ss << " vecLen = " << kextra->vecLen << std::endl; ss << " vecLenA = " << kextra->vecLenA << std::endl; ss << " vecLenB = " << kextra->vecLenB << std::endl; ss << " vecLenC = " << kextra->vecLenC << std::endl; return ss.str(); } std::string Step::dumpMemoryPattern() { std::stringstream ss; const MemoryPattern *pattern = pattern_; CLBLASMpatExtra *mpatExtra = static_cast(pattern->extra); if (pattern == NULL) { return ss.str(); } ss << " name = " << pattern->name << std::endl; ss << " nrLevels = " << pattern->nrLevels << std::endl; ss << " cuLevel = " << pattern->cuLevel << std::endl; ss << " thLevel = " << pattern->thLevel << std::endl; ss << " sops"; if (pattern->sops == NULL) { ss << " = -" << std::endl; } else { ss << std::endl; ss << " genKernel : " << ((pattern->sops->genKernel != NULL) ? "yes" : "no") << std::endl; ss << " assignKargs : " << ((pattern->sops->assignKargs != NULL) ? "yes" : "no") << std::endl; ss << " isFitToLDS : " << ((pattern->sops->isFitToLDS != NULL) ? "yes" : "no") << std::endl; ss << " innerDecompositionAxis: " << ((pattern->sops->innerDecompositionAxis != NULL) ? "yes" : "no") << std::endl; ss << " calcThreads : " << ((pattern->sops->calcThreads != NULL) ? "yes" : "no") << std::endl; ss << " imgPackMode : " << ((pattern->sops->imgPackMode != NULL) ? "yes" : "no") << std::endl; ss << " getFlags : " << ((pattern->sops->getFlags != NULL) ? "yes" : "no") << std::endl; } ss << " extra" << std::endl; ss << " aMset = "; dumpFlags(ss, static_cast(mpatExtra->aMset), memLevelFlagsDesc); ss << std::endl; ss << " bMset = "; dumpFlags(ss, static_cast(mpatExtra->bMset), memLevelFlagsDesc); ss << std::endl; ss << " mobjA = "; switch (mpatExtra->mobjA) { case CLMEM_GLOBAL_MEMORY: ss << "CLMEM_GLOBAL_MEMORY"; break; case CLMEM_LOCAL_MEMORY: ss << "CLMEM_LOCAL_MEMORY"; break; case CLMEM_IMAGE: ss << "CLMEM_IMAGE"; break; } ss << std::endl; ss << " mobjB = "; switch (mpatExtra->mobjB) { case CLMEM_GLOBAL_MEMORY: ss << "CLMEM_GLOBAL_MEMORY"; break; case CLMEM_LOCAL_MEMORY: ss << "CLMEM_LOCAL_MEMORY"; break; case CLMEM_IMAGE: ss << "CLMEM_IMAGE"; break; } ss << std::endl; return ss.str(); } clblas-2.10/src/library/tools/ktest/step.cpp000066400000000000000000000416561264277366700211400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include "step.h" using namespace clMath; // This enum reflects CLBlasKargs structure, declared in clblas-internal.h typedef enum StepKarg { KARG_NONE = 0, // kernType // dtype KARG_ORDER, KARG_SIDE, KARG_UPLO, KARG_TRANS_A, KARG_TRANS_B, KARG_DIAG, KARG_M, KARG_N, KARG_K, KARG_ALPHA, KARG_A, KARG_LDA, KARG_B, KARG_LDB, KARG_BETA, KARG_C, KARG_LDC, // addrBits KARG_OFFSET_M, KARG_OFFSET_N, KARG_OFFSET_K, KARG_SCIMAGE_0, KARG_SCIMAGE_1, KARG_OFF_A, KARG_OFF_BX, KARG_OFF_CY } StepKarg; Step::Step( BlasFunctionID funcID, cl_device_id device) : naiveCall_(""), compareCall_(""), postRandomCall_(""), kernelName_("") { memset(&step_, 0, sizeof(step_)); memset(&kextra_, 0, sizeof(kextra_)); step_.funcID = funcID; step_.device.id = device; identifyDevice(&step_.device); step_.args.A = (cl_mem)BUFFER_A; step_.args.B = (cl_mem)BUFFER_B; step_.args.C = (cl_mem)BUFFER_C; if (blasFunctionID() == CLBLAS_SYR2K) { kextra_.flags = static_cast (kextra_.flags | KEXTRA_SYRK_2K_RANK); step_.extraFlags = kextra_.flags; } } Step::Step(ListNode *node) : naiveCall_(""), compareCall_(""), postRandomCall_(""), kernelName_("") { SolutionStep *stepNode; memset(&kextra_, 0, sizeof(kextra_)); stepNode = container_of(node, node, SolutionStep); memcpy(&step_, stepNode, sizeof(step_)); kextra_.dtype = step_.args.dtype; kextra_.flags = step_.extraFlags; kextra_.kernType = CLBLAS_COMPUTING_KERNEL; } Step::~Step() { for (ArrayVarList::iterator it = arrays_.begin(); it != arrays_.end(); ++it) { delete (*it); } for (VarList::iterator it = vars_.begin(); it != vars_.end(); ++it) { delete (*it); } vars_.clear(); arrays_.clear(); buffers_.clear(); kargMap_.clear(); } BlasFunctionID Step::getStepNodeFuncID(ListNode *node) { SolutionStep *stepNode; stepNode = container_of(node, node, SolutionStep); return stepNode->funcID; } void Step::completeDecompositionSingle() { cl_int err; kextra_.dtype = kargs().dtype; kextra_.kernType = CLBLAS_COMPUTING_KERNEL; kextra_.flags = (KernelExtraFlags)(kextra_.flags | clblasArgsToKextraFlags(&step_.args, blasFunctionID())); if (deviceVendor(device()) == "Advanced Micro Devices, Inc.") { kextra_.flags = static_cast (kextra_.flags | KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD); } step_.pgran.wfSize = deviceWavefront(device(), &err); step_.extraFlags = kextra_.flags; step_.patternID = selectPattern(&step_, 0); pattern_ = &clblasSolvers[step_.funcID].memPatterns[step_.patternID]; if (0 == step_.subdims[0].bwidth && 0 == step_.subdims[0].bwidth && 0 == step_.subdims[0].bwidth) { getStepGranulation(&step_); } else if (pattern_->sops->checkCalcDecomp) { pattern_->sops->checkCalcDecomp(&step_.pgran, step_.subdims, 2, kextra_.dtype, PGRAN_CALC); } else { size_t wgX, wgY; size_t x0, y0; SolverFlags sflags; // Set up granulation for given dimensions wgY = step_.subdims[0].y/ step_.subdims[1].y; wgX = step_.subdims[0].x/ step_.subdims[1].x; x0 = step_.subdims[0].x; y0 = step_.subdims[0].y; if (funcBlasLevel(blasFunctionID()) == 2) { /* Level 2 decomposition size for vectors (dims[0].x) is 
1. * We have to "restore" it to proceed. */ size_t xBlocks; xBlocks = step_.subdims[0].bwidth / step_.subdims[1].bwidth; x0 = step_.subdims[1].x * xBlocks; } /* * adjust local size if a subproblem is not divisible * between all local threads */ for (; (wgY > 1) && (y0 < wgY); wgY /= 2) { } for (; (wgX > 1) && (x0 < wgX); wgX /= 2) { } sflags = pattern_->sops->getFlags(); if (sflags & SF_WSPACE_2D) { step_.pgran.wgDim = 2; step_.pgran.wgSize[0] = (unsigned int)wgY; step_.pgran.wgSize[1] = (unsigned int)wgX; } else { step_.pgran.wgDim = 1; step_.pgran.wgSize[0] = (unsigned int)(wgX * wgY); step_.pgran.wgSize[1] = 1; } // fixup work group size in respect with desired work dispatch order if ((step_.pgran.wgDim == 2) && pattern_->sops->innerDecompositionAxis) { if (pattern_->sops->innerDecompositionAxis(&step_.args) == DECOMP_AXIS_X) { unsigned int u; u = step_.pgran.wgSize[0]; step_.pgran.wgSize[0] = step_.pgran.wgSize[1]; step_.pgran.wgSize[1] = u; } } /* Check that dimensions are bigger than whole problem size */ if (dimensionsExceedProblemSize(&step_)) { getMinimalStepGranulation(&step_); } } detectProblemTails(&step_); kextra_.flags = step_.extraFlags; if (pattern_->sops->fixupArgs) { pattern_->sops->fixupArgs(&step_.args, &step_.subdims[0], &kextra_); } step_.extraFlags = kextra_.flags; detectOffsets(&step_); kextra_.flags = step_.extraFlags; selectVectorization(&step_, &kextra_); } void Step::makeSolutionSequence(ListHead *seq, cl_platform_id platform) { SolutionStep *newStep; (void)platform; step_.args.A = (cl_mem)BUFFER_A; step_.args.B = (cl_mem)BUFFER_B; step_.args.C = (cl_mem)BUFFER_C; newStep = (SolutionStep*)malloc(sizeof(SolutionStep)); memcpy(newStep, &step_, sizeof(SolutionStep)); listAddToTail(seq, &newStep->node); decomposeProblemStep(newStep); } void Step::freeSolutionSequence(ListHead *seq) { freeSolutionSeq(seq); } std::string Step::generate() { ssize_t size; char *buf; std::stringstream ss; if ((pattern_->sops == NULL) || (pattern_->sops->genKernel == NULL)) { return ""; } ss << "/*" << std::endl; for (int i = 0; i < MAX_SUBDIMS; i++) { ss << "SubproblemDim[" << i << "]" << std::endl; ss << dumpSubdim(step_.subdims + i) << std::endl; } ss << "PGranularity" << std::endl; ss << dumpPgran() << std::endl; ss << "CLBLASKernExtra" << std::endl; ss << dumpKextra() << std::endl; ss << "MemoryPattern" << std::endl; ss << dumpMemoryPattern(); ss << "*/" << std::endl << std::endl; size = pattern_->sops->genKernel(NULL, 0, step_.subdims, &step_.pgran, static_cast(&kextra_)); if (size <= 0) { return 0; } buf = new char[size + 1]; if (pattern_->sops->genKernel(buf, size, step_.subdims, &step_.pgran, static_cast(&kextra_)) != size) { delete[] buf; return ""; } ss << buf; delete[] buf; return ss.str(); } void Step::setKargs(const CLBlasKargs& kargs) { step_.args = kargs; } const char* Step::getBlasFunctionName() { switch (blasFunctionID()) { case CLBLAS_GEMV: return "gemv"; case CLBLAS_SYMV: return "symv"; case CLBLAS_GEMM: return "gemm"; case CLBLAS_TRMM: return "trmm"; case CLBLAS_TRSM: return "trsm"; case CLBLAS_SYRK: return "syrk"; case CLBLAS_SYR2K: return "syr2k"; default: return ""; } } void Step::setDecomposition( const SubproblemDim *subdims) { for (size_t i = 0; i < MAX_SUBDIMS; i++) { step_.subdims[i] = subdims[i]; } } Variable* Step::addVar( const std::string& name, const std::string& type, const std::string& defaultValue) { Variable *var = new Variable(name, type, defaultValue); vars_.push_back(var); return var; } Variable* Step::addConst( const std::string& name, const 
std::string& type, const std::string& defaultValue) { Variable *var = addVar(name, type, defaultValue); var->setConstant(true); return var; } Variable* Step::addVar( const std::string& name, const std::string& type, size_t value) { return addVar(name, type, boost::lexical_cast(value)); } Variable* Step::addConst( const std::string& name, const std::string& type, size_t value) { return addConst(name, type, boost::lexical_cast(value)); } Variable* Step::addVar( const std::string& name, const std::string& type, int value) { return addVar(name, type, boost::lexical_cast(value)); } Variable* Step::addConst( const std::string& name, const std::string& type, int value) { return addConst(name, type, boost::lexical_cast(value)); } MatrixVariable* Step::addMatrix( const std::string& name, const std::string& type, Variable *rows, Variable *columns, Variable *ld, Variable *off) { MatrixVariable *var = new MatrixVariable(name, type, "NULL"); var->setMatrixSize(rows, columns, ld, off); arrays_.push_back(var); return var; } VectorVariable* Step::addVector( const std::string& name, const std::string& type, Variable *N, Variable *inc, Variable *off) { VectorVariable *var = new VectorVariable(name, type, "NULL"); var->setVectorSize(N, inc, off); arrays_.push_back(var); return var; } Variable* Step::addBuffer( BufferID bufID, const std::string& name, const std::string& type, cl_mem_flags flags, ArrayVariableInterface* hostPtr) { Variable *var = addVar(name, type, "NULL"); var->setIsBuffer(true); var->setFlags(flags); var->setHostPtr(hostPtr); var->setBufferID(bufID); buffers_.push_back(var); return var; } Variable* Step::getBuffer(BufferID bufID) { for (VarList::iterator it = buffers_.begin(); it != buffers_.end(); ++it) { if ((*it)->getBufID() == bufID) { return (*it); } } return NULL; } void Step::setKernelArg( unsigned int index, const Variable *var) { kargMap_[index] = var; } std::string Step::matrixSize(MatrixVariable *matrix) { std::stringstream size; if ((matrix->rows() == NULL) || (matrix->columns() == NULL)) { return ""; } if (matrix->off() != NULL) { size << matrix->off()->name() << " + "; } if (matrix->ld() != NULL) { size << matrix->ld()->name() << " * "; } if (step_.args.order == clblasColumnMajor) { size << matrix->columns()->name(); } else { size << matrix->rows()->name(); } return size.str(); } std::string Step::vectorSize(VectorVariable *vector) { std::stringstream size; if (vector->nElems() == NULL) { return ""; } if (vector->off() != NULL) { size << vector->off()->name() << " + "; } if (vector->inc() == NULL) { size << vector->nElems()->name(); } else { size << "1 + (" << vector->nElems()->name() << " - 1) * abs(" << vector->inc()->name() << ")"; } return size.str(); } void Step::assignKargs(const StepKargs& map) { CLBlasKargs args; KernelArg kargsList[MAX_KERNEL_ARGS]; Variable *v; if ((pattern_->sops == NULL) || (pattern_->sops->assignKargs == NULL)) { return; } memset(&kargsList, KARG_NONE, sizeof(kargsList)); args.kernType = CLBLAS_COMPUTING_KERNEL; args.dtype = TYPE_COMPLEX_DOUBLE; args.addrBits = 0; args.order = static_cast(KARG_ORDER); args.side = static_cast(KARG_SIDE); args.uplo = static_cast(KARG_UPLO); args.transA = static_cast(KARG_TRANS_A); args.transB = static_cast(KARG_TRANS_B); args.diag = static_cast(KARG_DIAG); args.M = KARG_M; args.N = KARG_N; args.K = KARG_K; args.lda.matrix = KARG_LDA; args.ldb.matrix = KARG_LDB; args.ldc.matrix = KARG_LDC; args.offsetM = KARG_OFFSET_M; args.offsetN = KARG_OFFSET_N; args.offsetK = KARG_OFFSET_K; args.offA = KARG_OFF_A; args.offBX = 
KARG_OFF_BX; args.offCY = KARG_OFF_CY; args.A = reinterpret_cast(KARG_A); args.B = reinterpret_cast(KARG_B); args.C = reinterpret_cast(KARG_C); memset(&args.alpha, KARG_ALPHA, sizeof(args.alpha)); memset(&args.beta, KARG_BETA, sizeof(args.beta)); args.scimage[0] = reinterpret_cast(KARG_SCIMAGE_0); args.scimage[1] = reinterpret_cast(KARG_SCIMAGE_1); pattern_->sops->assignKargs(kargsList, static_cast(&args), &kextra_); for (unsigned int i = 0; (i < MAX_KERNEL_ARGS) && (kargsList[i].typeSize != 0); i++) { switch (static_cast(kargsList[i].arg.data[0])) { case KARG_M: v = map.M; break; case KARG_N: v = map.N; break; case KARG_K: v = map.K; break; case KARG_ALPHA: v = map.alpha; break; case KARG_A: v = map.A; break; case KARG_LDA: v = map.lda; break; case KARG_B: v = map.B; break; case KARG_LDB: v = map.ldb; break; case KARG_BETA: v = map.beta; break; case KARG_C: v = map.C; break; case KARG_LDC: v = map.ldc; break; case KARG_OFFSET_M: v = map.offsetM; break; case KARG_OFFSET_N: v = map.offsetN; break; case KARG_OFFSET_K: v = map.offsetK; break; case KARG_SCIMAGE_0: v = map.scimage0; break; case KARG_SCIMAGE_1: v = map.scimage1; break; case KARG_OFF_A: v = map.offA; break; case KARG_OFF_BX: v = map.offBX; break; case KARG_OFF_CY: v = map.offCY; break; default: // KARG_ORDER, KARG_SIDE, KARG_UPLO, KARG_TRANS_A, KARG_TRANS_B, // KARG_DIAG v = NULL; break; } if (v != NULL) { setKernelArg(i, v); } } } std::string Step::globalWorkSize() { size_t globalWorkSize[MAX_WORK_DIM] = { 0, 0, 0 }; std::stringstream ss; SubproblemDim dims[MAX_SUBDIMS]; memcpy(dims, step_.subdims, sizeof(dims)); if (pattern_->sops->calcThreads) { pattern_->sops->calcThreads(globalWorkSize, step_.subdims, &step_.pgran, &step_.args, &kextra_); } else { SubproblemDim globDim; const PGranularity *pg; pg = (pattern_->nrLevels == 1) ? NULL : &step_.pgran; kargsToProbDims(&globDim, blasFunctionID(), &step_.args, false); // fixup dimensions in respect with desired work dispatch order if ((pgran().wgDim == 2) && pattern_->sops->innerDecompositionAxis) { if (pattern_->sops->innerDecompositionAxis(&step_.args) == DECOMP_AXIS_X) { /* * these dimensions will not be used more anywhere, so we can * just swap them */ swapDimXY(&dims[0]); swapDimXY(&dims[1]); swapDimXY(&globDim); } } calcGlobalThreads(globalWorkSize, dims, pg, globDim.y, globDim.x); } for (unsigned int i = 0; i < pgran().wgDim; i++) { if (i != 0) { ss << ", "; } ss << globalWorkSize[i]; } return ss.str(); } void Step::setKernelName(std::string name) { kernelName_ = name; } std::string Step::deviceVendor(cl_device_id device) { cl_int err; size_t len; char *str; std::string vendor = ""; err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, 0, NULL, &len); if (err != CL_SUCCESS) { return ""; } str = new char[len + 1]; err = clGetDeviceInfo(device, CL_DEVICE_VENDOR, len, str, NULL); if (err == CL_SUCCESS) { vendor = str; } delete[] str; return vendor; } clblas-2.10/src/library/tools/ktest/step.h000066400000000000000000000334451264277366700206020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_PATTERN_H__ #define KTEST_PATTERN_H__ #ifdef __APPLE__ #include #else #include #endif #include #include #include #include #include #include #include #include #include #include #include "var.h" namespace clMath { // This structure reflects CLBlasKargs structure, declared in clblas-internal.h typedef struct StepKargs { // kernType // dtype // order // side // uplo // transA // transB // diag Variable *M; Variable *N; Variable *K; Variable *alpha; Variable *A; Variable *lda; Variable *B; Variable *ldb; Variable *beta; Variable *C; Variable *ldc; // addrBits Variable *offsetM; Variable *offsetN; Variable *offsetK; Variable *scimage0; Variable *scimage1; Variable *offA; Variable *offBX; Variable *offCY; } Kargs; typedef std::list VarList; typedef std::list ArrayVarList; typedef std::map KArgMap; /** * @internal * @brief SolutionStep wrapper object * @ingroup MAKE_KTEST * * Objects of this class are used for problem decomposition. Each Step object * contains single SolutionStep structure. For disabled multikernel feature * case there is only one solution step always. For multikernel case there is * one master step storing arguments of original problem and inner steps * which are received from solution sequence list generated by clBLAS in * makeSolutionSequence call. * */ class Step { private: CLBLASKernExtra kextra_; cl_platform_id platform_; VarList vars_; ArrayVarList arrays_; VarList buffers_; /** * @internal * @brief Kernel arguments map * * Contains variables objects for arguments of step kernel, * in respective order. */ KArgMap kargMap_; std::string dumpMemoryPattern(); std::string dumpSubdim(const SubproblemDim *subdim); std::string dumpPgran(); std::string dumpKextra(); cl_device_id device() { return step_.device.id; }; void setKernelArg(unsigned int index, const Variable *var); protected: /** * @internal * @brief Associated SolutionStep structure */ SolutionStep step_; /** * @internal * @brief Selected memory pattern pointer */ MemoryPattern* pattern_; /** * @internal * @brief Naive call string * * This string contains naive call for processing step problem. Is used in * master step. */ std::string naiveCall_; /** * @internal * @brief Comparison call string * * This string contains comparison call for processed matrixes. Is used in * master step. */ std::string compareCall_; /** * @internal * @brief Post process matrixes * * This string contains function call for post-processing matrixes after * filling them with random data. Can be used in master step. */ std::string postRandomCall_; /** * @internal * @brief Step kernel name */ std::string kernelName_; /** * @internal * @brief Add variable into step variables list. * * Variable value is given by string. */ Variable* addVar(const std::string& name, const std::string& type, const std::string& defaultValue = ""); /** * @internal * @brief Add variable into step variables list. * * Variable value is given by unsigned value. 
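     *
     * Derived steps use this family of add helpers to declare scalar kernel
     * arguments, for example (taken from GemmStep::declareVars() in gemm.cpp,
     * further below in this archive):
     * @code
     * args.M    = addConst("M",   "cl_uint", kargs().M);
     * args.lda  = addConst("lda", "cl_uint", kargs().lda.matrix);
     * args.offA = addVar("offA",  "cl_uint", kargs().offA);
     * @endcode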
*/ Variable* addVar(const std::string& name, const std::string& type, size_t value); /** * @internal * @brief Add variable into step variables list. * * Variable value is given by signed integer. */ Variable* addVar(const std::string& name, const std::string& type, int value); /** * @internal * @brief Add constant variable into step variables list. * * Constant value is given by string. */ Variable* addConst(const std::string& name, const std::string& type, const std::string& defaultValue); /** * @internal * @brief Add constant variable into step variables list. * * Constant value is given by unsigned value. */ Variable* addConst(const std::string& name, const std::string& type, size_t value); /** * @internal * @brief Add constant variable into step variables list. * * Constant value is given by signed integer. */ Variable* addConst(const std::string& name, const std::string& type, int value); /** * @internal * @brief Add matrix array into step host arrays list. */ MatrixVariable* addMatrix(const std::string& name, const std::string& type, Variable *rows, Variable *columns, Variable *ld, Variable *off = NULL); /** * @internal * @brief Add vector into step host arrays list. */ VectorVariable* addVector(const std::string& name, const std::string& type, Variable *N, Variable *inc, Variable *off = NULL); /** * @internal * @brief Add variable for OpenCL buffer into step buffers list. */ Variable* addBuffer(BufferID bufID, const std::string& name, const std::string& type, cl_mem_flags flags, ArrayVariableInterface* hostPtr); /** * @internal * @brief Assign kernel arguments * * Run pattern assign-kernel-arguments function and get information about * used variables and their order which is used for generating kernel test * code. */ void assignKargs(const Kargs& kargs); /** * @internal * @brief Get device vendor string */ static std::string deviceVendor(cl_device_id device); public: /** * @internal * @brief Constructor for master step * * @param[in] funcID Function identifier * @param[in] device Device identifier * * Uses function id and device to compose step object. It is used for * master step. * */ Step(BlasFunctionID funcID, cl_device_id device); /** * @internal * @brief Constructor for inner step * * @param[in] node Solution sequence list node * * Uses solution sequence node to compose step object. It is used for * making inner steps from solution sequence list received from * clBLAS frontend using makeSolutionSequence. * */ Step(ListNode *node); /** * @internal * @brief Step destructor */ virtual ~Step(); /** * @internal * @brief Get step variables list */ const VarList& vars() const { return vars_; }; /** * @internal * @brief Get step host arrays list */ const ArrayVarList& arrays() const { return arrays_; }; /** * @internal * @brief Get step OpenCL buffers list */ const VarList& buffers() const { return buffers_; }; /** * @internal * @brief Fix leading dimensions to fit matrixes sizes */ virtual void fixLD() = 0; /** * @internal * @brief Declare variables * * @param[in] masterStep Master step object * * Add function-specific variables and fill comparison call and naive * implementation call strings. Master step object is used for handling * buffers A, B, C rearrangement. * */ virtual void declareVars(Step *masterStep) = 0; /** * @internal * @brief Get buffer by id * * @param[in] bufID Buffer identifier * * Return variable of step for buffer A, B or C. Is used for multi-step * configurations for handling buffers rearrangement in inner steps. 
Inner * steps get buffer variables names from respective master step buffers. */ Variable* getBuffer(BufferID bufID); /** * @internal * @brief Complete problem decomposition of a single step * * Parallelism granularity, tails flags and vectorization values are * guaranteed to be set in appropriate values after this function call. */ void completeDecompositionSingle(); /** * @internal * @brief Wrapper for makeSolutionSeq * * @param[out] seq Solution sequence list head * @param[in] platform Platform identifier * * Call makeSolutionSeq from clBLAS frontend and return solution sequence * list for it. */ void makeSolutionSequence(ListHead *seq, cl_platform_id platform); /** * @internal * @brief Wrapper for freeSolutionSeq * * @param[out] seq Solution sequence list head * * Call freeSolutionSeq from clBLAS frontend. */ void freeSolutionSequence(ListHead *seq); /** * @internal * @brief Generate step kernel code * * @return String containing kernel code for this step */ std::string generate(); /** * @internal * @brief Generate step global work size string * * @return String containing global work size for this step */ std::string globalWorkSize(); /** * @internal * @brief Get step blas function identifier * @return blas function id */ BlasFunctionID blasFunctionID() const { return step_.funcID; }; /** * @internal * @brief Get step kernel arguments * @return step kernel arguments structure */ const CLBlasKargs& kargs() const { return step_.args; }; /** * @internal * @brief Get step parallelism granularity * @return step parallelism granularity structure */ const PGranularity& pgran() const { return step_.pgran; }; /** * @internal * @brief Get naive call string * * Get string containing naive blas function call for step blas function * with respective step flags and arguments. * @return naive blas call string */ const std::string& naiveCall() const { return naiveCall_; }; /** * @internal * @brief Get comparison call string * * Get string containing resulting vectors of matrixes comparison function * call for step blas function. * @return comparison call string */ const std::string& compareCall() const { return compareCall_; }; /** * @internal * @brief Get post-processing call * * Get string containing function call which is called after setting step * matrixes. Is used in TRSM now for making divisible B matrix. * @return step matrixes post-processing call */ const std::string& postRandomCall() const { return postRandomCall_; }; /** * @internal * @brief Get step kernel name * @return step kernel name */ const std::string& kernelName() const { return kernelName_; }; /** * @internal * @brief Get blas function name * Returns blas function name from naive blas for this step. 
* @return blas function name */ const char* getBlasFunctionName(); /** * @internal * @brief Get kernel arguments variables map * @return step kernel arguments variables map */ const std::map& kargMap() const { return kargMap_; } /** * @internal * @brief Set step blas arguments * @param[in] kargs Step blas arguments structure */ void setKargs(const CLBlasKargs& kargs); /** * @internal * @brief Set step blas subdimensions * @param[in] subdims Step subproblem dimensions */ void setDecomposition(const SubproblemDim *subdims); /** * @internal * @brief Set step kernel name * @param[in] name Step kernel name */ void setKernelName(std::string name); /** * @internal * @brief Get string containing matrix size * @param[in] var Matrix variable * @return matrix variable size string */ std::string matrixSize(MatrixVariable *var); /** * @internal * @brief Get string containing vector size * @param[in] var Vector variable * @return vector variable size string */ std::string vectorSize(VectorVariable *vector); /** * @internal * @brief Get string containing argument value * @param[in] dtype Argument type * @param[in] arg Argument value * Get string containing argument value. Argument can have complex type. * @return string containing argument value */ static std::string multiplierToString(DataType dtype, ArgMultiplier arg); /** * @internal * @brief Get string containing type * @param[in] dtype Data type * @return Data type string */ static std::string dtypeToString(DataType dtype); /** * @internal * @brief Get solution node blas function identifier * @param[in] node Solution sequence node * @return blas function id */ static BlasFunctionID getStepNodeFuncID(ListNode *node); }; } // namespace clMath #endif // KTEST_PATTERN_H__ clblas-2.10/src/library/tools/ktest/steps/000077500000000000000000000000001264277366700206035ustar00rootroot00000000000000clblas-2.10/src/library/tools/ktest/steps/gemm.cpp000066400000000000000000000131531264277366700222370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "gemm.h" using namespace clMath; GemmStep::GemmStep(cl_device_id device) : Step(CLBLAS_GEMM, device) { } GemmStep::GemmStep(ListNode *node) : Step(node) { } void GemmStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A, *B, *C, *naiveC; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.M = addConst("M", "cl_uint", kargs().M); args.N = addConst("N", "cl_uint", kargs().N); args.K = addConst("K", "cl_uint", kargs().K); args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix); args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix); //TODO: remove after all gemm generators use offsets A,B,C args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM); args.offsetN = addConst("offsetN", "cl_uint", kargs().offsetN); args.offsetK = addConst("offsetK", "cl_uint", kargs().offsetK); args.offA = addVar("offA", "cl_uint", kargs().offA); args.offBX = addVar("offB", "cl_uint", kargs().offBX); args.offCY = addVar("offC", "cl_uint", kargs().offCY); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); args.beta = addVar("beta", type, multiplierToString(kargs().dtype, kargs().beta)); if (kargs().transA == clblasNoTrans) { A = addMatrix("A", type + "*", args.M, args.K, args.lda, args.offA); } else { A = addMatrix("A", type + "*", args.K, args.M, args.lda, args.offA); } if (kargs().transB == clblasNoTrans) { B = addMatrix("B", type + "*", args.K, args.N, args.ldb, args.offBX); } else { B = addMatrix("B", type + "*", args.N, args.K, args.ldb, args.offBX); } C = addMatrix("C", type + "*", args.M, args.N, args.ldc, args.offCY); naiveC = addMatrix("naiveC", type + "*", args.M, args.N, args.ldc, args.offCY); naiveC->setCopy(C); std::string bufAName, bufBName, bufCName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufB"; bufCName = "bufC"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, B); args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, transA, transB, " << args.M->name() << ", " << args.N->name() << ", " << args.K->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << B->matrixPointer() << ", " << args.ldb->name() << ", " << args.beta->name() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; naiveCall_ = ss.str(); ss.str(""); ss << "compareMatrices(order, " << args.M->name() << ", " << args.N->name() << ", " << C->matrixPointer() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; compareCall_ = ss.str(); } void GemmStep::fixLD() { CLBlasKargs args; args = kargs(); switch (args.transA) { case clblasNoTrans: if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.M)) { args.lda.matrix = args.M; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } break; case clblasTrans: case clblasConjTrans: if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } if 
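/* When A is transposed it is declared as a K-by-M matrix, so the row-major
   check that follows clamps lda up to M (the number of columns), mirroring
   the column-major check just above that clamps it to K. */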
((args.order == clblasRowMajor) && (args.lda.matrix < args.M)) { args.lda.matrix = args.M; } break; } switch (args.transB) { case clblasNoTrans: if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.K)) { args.ldb.matrix = args.K; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } break; case clblasTrans: case clblasConjTrans: if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.K)) { args.ldb.matrix = args.K; } break; } if ((args.order == clblasColumnMajor) && (args.ldc.matrix < args.M)) { args.ldc.matrix = args.M; } if ((args.order == clblasRowMajor) && (args.ldc.matrix < args.N)) { args.ldc.matrix = args.N; } setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/gemm.h000066400000000000000000000020671264277366700217060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_GEMM_H__ #define KTEST_GEMM_H__ #include "../step.h" namespace clMath { class GemmStep : public Step { public: GemmStep(cl_device_id device); GemmStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_GEMM_H__ clblas-2.10/src/library/tools/ktest/steps/gemv.cpp000066400000000000000000000110701264277366700222440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "gemv.h" using namespace clMath; GemvStep::GemvStep(cl_device_id device) : Step(CLBLAS_GEMV, device) { } GemvStep::GemvStep(ListNode *node) : Step(node) { } void GemvStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A; VectorVariable *X, *Y, *naiveY; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.M = addConst("M", "cl_uint", kargs().M); args.N = addConst("N", "cl_uint", kargs().N); args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("incx", "cl_int", kargs().ldb.vector); args.ldc = addConst("incy", "cl_int", kargs().ldc.vector); args.offA = addConst("offA", "cl_uint", kargs().offA); args.offBX = addConst("offX", "cl_uint", kargs().offBX); args.offCY = addConst("offY", "cl_uint", kargs().offCY); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); args.beta = addVar("beta", type, multiplierToString(kargs().dtype, kargs().beta)); A = addMatrix("A", type + "*", args.M, args.N, args.lda, args.offA); if (kargs().transA == clblasNoTrans) { X = addVector("X", type + "*", args.N, args.ldb, args.offBX); Y = addVector("Y", type + "*", args.M, args.ldc, args.offCY); naiveY = addVector("naiveY", type + "*", args.M, args.ldc, args.offCY); } else { X = addVector("X", type + "*", args.M, args.ldb, args.offBX); Y = addVector("Y", type + "*", args.N, args.ldc, args.offCY); naiveY = addVector("naiveY", type + "*", args.N, args.ldc, args.offCY); } naiveY->setCopy(Y); std::string bufAName, bufBName, bufCName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufX"; bufCName = "bufY"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, X); args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, Y); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, transA, " << args.M->name() << ", " << args.N->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << X->vectorPointer() << ", " << args.ldb->name() << ", " << args.beta->name() << ", " << naiveY->vectorPointer() << ", " << args.ldc->name() << ")"; naiveCall_ = ss.str(); ss.str(""); if (kargs().transA == clblasNoTrans) { ss << "compareVectors(" << args.M->name() << ", " << Y->vectorPointer() << ", " << naiveY->vectorPointer() << ", " << args.ldc->name() << ")"; } else { ss << "compareVectors(" << args.N->name() << ", " << Y->vectorPointer() << ", " << naiveY->vectorPointer() << ", " << args.ldc->name() << ")"; } compareCall_ = ss.str(); } void GemvStep::fixLD() { CLBlasKargs args; args = kargs(); /* M is always number of rows and N is number of columns in gemv */ if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.M)) { args.lda.matrix = args.M; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.N)) { args.lda.matrix = args.N; } if (args.ldb.vector == 0) { args.ldb.vector = 1; } if (args.ldc.vector == 0) { args.ldc.vector = 1; } /* * store original height of the matrix A */ args.K = (args.transA == clblasNoTrans) ? 
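/* GEMV itself has no K argument, so K is reused here to remember the
   original height of A (M when A is not transposed, N otherwise) for later
   processing. */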
args.M : args.N; setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/gemv.h000066400000000000000000000020671264277366700217170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_GEMV_H__ #define KTEST_GEMV_H__ #include "../step.h" namespace clMath { class GemvStep : public Step { public: GemvStep(cl_device_id device); GemvStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_GEMV_H__ clblas-2.10/src/library/tools/ktest/steps/symv.cpp000066400000000000000000000073351264277366700223150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "symv.h" using namespace clMath; SymvStep::SymvStep(cl_device_id device) : Step(CLBLAS_SYMV, device) { } SymvStep::SymvStep(ListNode *node) : Step(node) { } void SymvStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A; VectorVariable *X, *Y, *naiveY; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.N = addConst("N", "cl_uint", kargs().N); args.K = args.N; args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("incx", "cl_int", kargs().ldb.vector); args.ldc = addConst("incy", "cl_int", kargs().ldc.vector); args.offsetN = addConst("offsetN", "cl_uint", kargs().offsetN); args.offA = addConst("offA", "cl_uint", kargs().offA); args.offBX = addConst("offx", "cl_uint", kargs().offBX); args.offCY = addConst("offy", "cl_uint", kargs().offCY); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); args.beta = addVar("beta", type, multiplierToString(kargs().dtype, kargs().beta)); A = addMatrix("A", type + "*", args.N, args.N, args.lda, args.offA); X = addVector("X", type + "*", args.N, args.ldb, args.offBX); Y = addVector("Y", type + "*", args.N, args.ldc, args.offCY); naiveY = addVector("naiveY", type + "*", args.N, args.ldc, args.offCY); naiveY->setCopy(Y); std::string bufAName, bufBName, bufCName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufX"; bufCName = "bufY"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, X); args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, Y); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, uplo, " << args.N->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << X->vectorPointer() << ", " << args.ldb->name() << ", " << args.beta->name() << ", " << naiveY->vectorPointer() << ", " << args.ldc->name() << ")"; naiveCall_ = ss.str(); ss.str(""); ss << "compareVectors(" << args.N->name() << ", " << Y->vectorPointer() << ", " << naiveY->vectorPointer() << ", " << args.ldc->name() << ")"; compareCall_ = ss.str(); } void SymvStep::fixLD() { CLBlasKargs args; args = kargs(); if (args.lda.matrix < args.N) { args.lda.matrix = args.N; } if (args.ldb.vector == 0) { args.ldb.vector = 1; } if (args.ldc.vector == 0) { args.ldc.vector = 1; } args.K = args.N; //store original N setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/symv.h000066400000000000000000000020671264277366700217570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_SYMV_H__ #define KTEST_SYMV_H__ #include "../step.h" namespace clMath { class SymvStep : public Step { public: SymvStep(cl_device_id device); SymvStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_SYMV_H__ clblas-2.10/src/library/tools/ktest/steps/syr2k.cpp000066400000000000000000000117721264277366700223710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "syr2k.h" using namespace clMath; Syr2kStep::Syr2kStep(cl_device_id device) : Step(CLBLAS_SYR2K, device) { } Syr2kStep::Syr2kStep(ListNode *node) : Step(node) { } void Syr2kStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A, *B, *C, *naiveC; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.N = addConst("N", "cl_uint", kargs().N); args.M = args.N; args.K = addConst("K", "cl_uint", kargs().K); args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix); args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix); args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM); args.offA = addVar("offA", "cl_uint", kargs().offA); args.offBX = addVar("offB", "cl_uint", kargs().offBX); args.offCY = addVar("offC", "cl_uint", kargs().offCY); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); args.beta = addVar("beta", type, multiplierToString(kargs().dtype, kargs().beta)); if (kargs().transA == clblasNoTrans) { A = addMatrix("A", type + "*", args.N, args.K, args.lda, args.offA); B = addMatrix("B", type + "*", args.N, args.K, args.lda, args.offBX); } else { A = addMatrix("A", type + "*", args.K, args.N, args.lda, args.offA); B = addMatrix("B", type + "*", args.K, args.N, args.lda, args.offBX); } C = addMatrix("C", type + "*", args.N, args.N, args.ldc, args.offCY); naiveC = addMatrix("naiveC", type + "*", args.N, args.N, args.ldc, args.offCY); naiveC->setCopy(C); std::string bufAName, bufBName, bufCName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufB"; bufCName = "bufC"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_ONLY, B); args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, uplo, transA, " 
<< args.N->name() << ", " << args.K->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << B->matrixPointer() << ", " << args.ldb->name() << ", " << args.beta->name() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; naiveCall_ = ss.str(); ss.str(""); ss << "compareMatrices(order, " << args.N->name() << ", " << args.N->name() << ", " << C->matrixPointer() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; compareCall_ = ss.str(); } void Syr2kStep::fixLD() { CLBlasKargs args; args = kargs(); if (args.transA == clblasNoTrans) { if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.N)) { args.lda.matrix = args.N; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.K)) { args.ldb.matrix = args.K; } } else { if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.N)) { args.lda.matrix = args.N; } if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.K)) { args.ldb.matrix = args.K; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } } if (args.ldc.matrix < args.N) { args.ldc.matrix = args.N; } args.transB = args.transA; args.M = args.N; setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/syr2k.h000066400000000000000000000020751264277366700220320ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_SYR2K_H__ #define KTEST_SYR2K_H__ #include "../step.h" namespace clMath { class Syr2kStep : public Step { public: Syr2kStep(cl_device_id device); Syr2kStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_SYR2K_H__ clblas-2.10/src/library/tools/ktest/steps/syrk.cpp000066400000000000000000000101211264277366700222720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "syrk.h" using namespace clMath; SyrkStep::SyrkStep(cl_device_id device) : Step(CLBLAS_SYRK, device) { } SyrkStep::SyrkStep(ListNode *node) : Step(node) { } void SyrkStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A, *C, *naiveC; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.N = addConst("N", "cl_uint", kargs().N); args.M = args.N; args.K = addConst("K", "cl_uint", kargs().K); args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = args.lda; args.ldc = addConst("ldc", "cl_uint", kargs().ldc.matrix); args.offsetM = addConst("offsetM", "cl_uint", kargs().offsetM); args.offA = addVar("offA", "cl_uint", kargs().offA); args.offBX = args.offA; args.offCY = addVar("offC", "cl_uint", kargs().offCY); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); args.beta = addVar("beta", type, multiplierToString(kargs().dtype, kargs().beta)); if (kargs().transA == clblasNoTrans) { A = addMatrix("A", type + "*", args.N, args.K, args.lda, args.offA); } else { A = addMatrix("A", type + "*", args.K, args.N, args.lda, args.offA); } C = addMatrix("C", type + "*", args.N, args.N, args.ldc, args.offCY); naiveC = addMatrix("naiveC", type + "*", args.N, args.N, args.ldc, args.offCY); naiveC->setCopy(C); std::string bufAName, bufCName; if (NULL == masterStep) { bufAName = "bufA"; bufCName = "bufC"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufCName = masterStep->getBuffer((BufferID)(long)step_.args.C)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.C = addBuffer(BUFFER_C, bufCName, "cl_mem", CL_MEM_READ_WRITE, C); args.B = args.A; assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, uplo, transA, " << args.N->name() << ", " << args.K->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << args.beta->name() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; naiveCall_ = ss.str(); ss.str(""); ss << "compareMatrices(order, " << args.N->name() << ", " << args.N->name() << ", " << C->matrixPointer() << ", " << naiveC->matrixPointer() << ", " << args.ldc->name() << ")"; compareCall_ = ss.str(); } void SyrkStep::fixLD() { CLBlasKargs args; args = kargs(); if (args.transA == clblasNoTrans) { if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.N)) { args.lda.matrix = args.N; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } } else { if ((args.order == clblasColumnMajor) && (args.lda.matrix < args.K)) { args.lda.matrix = args.K; } if ((args.order == clblasRowMajor) && (args.lda.matrix < args.N)) { args.lda.matrix = args.N; } } if (args.ldc.matrix < args.N) { args.ldc.matrix = args.N; } args.transB = args.transA; args.M = args.N; args.ldb.matrix = args.lda.matrix; setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/syrk.h000066400000000000000000000020671264277366700217510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_SYRK_H__ #define KTEST_SYRK_H__ #include "../step.h" namespace clMath { class SyrkStep : public Step { public: SyrkStep(cl_device_id device); SyrkStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_SYRK_H__ clblas-2.10/src/library/tools/ktest/steps/trmm.cpp000066400000000000000000000076061264277366700222770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "trmm.h" using namespace clMath; TrmmStep::TrmmStep(cl_device_id device) : Step(CLBLAS_TRMM, device) { } TrmmStep::TrmmStep(ListNode *node) : Step(node) { } void TrmmStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A, *B, *naiveB; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.M = addConst("M", "cl_uint", kargs().M); args.N = addConst("N", "cl_uint", kargs().N); if (kargs().side == clblasLeft) { args.K = args.M; } else { args.K = args.N; } args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix); args.offA = addVar("offA", "cl_uint", kargs().offA); args.offBX = addVar("offB", "cl_uint", kargs().offBX); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); if (kargs().side == clblasLeft) { A = addMatrix("A", type + "*", args.M, args.M, args.lda, args.offA); } else { A = addMatrix("A", type + "*", args.N, args.N, args.lda, args.offA); } B = addMatrix("B", type + "*", args.M, args.N, args.ldb, args.offBX); naiveB = addMatrix("naiveB", type + "*", args.M, args.N, args.ldb, args.offBX); naiveB->setCopy(B); std::string bufAName, bufBName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufB"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_WRITE, B); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, side, uplo, transA, diag, " << args.M->name() << ", " << args.N->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << naiveB->matrixPointer() << ", " << args.ldb->name() << ")"; naiveCall_ 
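/* TRMM overwrites B in place, which is why the reference call assembled
   above targets the naiveB copy while the device kernel writes into B;
   compareCall_, built just below, then checks the two results against
   each other. */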
= ss.str(); ss.str(""); ss << "compareMatrices(order, " << args.M->name() << ", " << args.N->name() << ", " << B->matrixPointer() << ", " << naiveB->matrixPointer() << ", " << args.ldb->name() << ")"; compareCall_ = ss.str(); } void TrmmStep::fixLD() { CLBlasKargs args; args = kargs(); if (args.side == clblasLeft) { if (args.lda.matrix < args.M) { args.lda.matrix = args.M; } } else { if (args.lda.matrix < args.N) { args.lda.matrix = args.N; } } if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.M)) { args.ldb.matrix = args.M; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } // Store original problem size in K, this is used to know it while // calculating result by parts using M or N as part size if (args.side == clblasLeft) { args.K = args.M; } else { args.K = args.N; } setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/trmm.h000066400000000000000000000020671264277366700217400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_TRMM_H__ #define KTEST_TRMM_H__ #include "../step.h" namespace clMath { class TrmmStep : public Step { public: TrmmStep(cl_device_id device); TrmmStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_TRMM_H__ clblas-2.10/src/library/tools/ktest/steps/trsm.cpp000066400000000000000000000103321264277366700222730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "trsm.h" using namespace clMath; TrsmStep::TrsmStep(cl_device_id device) : Step(CLBLAS_TRSM, device) { } TrsmStep::TrsmStep(ListNode *node) : Step(node) { } void TrsmStep::declareVars(Step *masterStep) { StepKargs args; MatrixVariable *A, *B, *naiveB; memset(&args, 0, sizeof(args)); std::string type = dtypeToString(kargs().dtype); args.M = addConst("M", "cl_uint", kargs().M); args.N = addConst("N", "cl_uint", kargs().N); if (kargs().side == clblasLeft) { args.K = args.M; } else { args.K = args.N; } args.lda = addConst("lda", "cl_uint", kargs().lda.matrix); args.ldb = addConst("ldb", "cl_uint", kargs().ldb.matrix); args.offA = addVar("offA", "cl_uint", kargs().offA); args.offBX = addVar("offB", "cl_uint", kargs().offBX); args.alpha = addVar("alpha", type, multiplierToString(kargs().dtype, kargs().alpha)); if (kargs().side == clblasLeft) { A = addMatrix("A", type + "*", args.M, args.M, args.lda, args.offA); } else { A = addMatrix("A", type + "*", args.N, args.N, args.lda, args.offA); } B = addMatrix("B", type + "*", args.M, args.N, args.ldb, args.offBX); naiveB = addMatrix("naiveB", type + "*", args.M, args.N, args.ldb, args.offBX); naiveB->setCopy(B); std::string bufAName, bufBName; if (NULL == masterStep) { bufAName = "bufA"; bufBName = "bufB"; } else { bufAName = masterStep->getBuffer((BufferID)(long)step_.args.A)->name(); bufBName = masterStep->getBuffer((BufferID)(long)step_.args.B)->name(); } args.A = addBuffer(BUFFER_A, bufAName, "cl_mem", CL_MEM_READ_ONLY, A); args.B = addBuffer(BUFFER_B, bufBName, "cl_mem", CL_MEM_READ_WRITE, B); assignKargs(args); std::stringstream ss; ss << getBlasFunctionName() << "(order, side, uplo, transA, diag, " << args.M->name() << ", " << args.N->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << naiveB->matrixPointer() << ", " << args.ldb->name() << ")"; naiveCall_ = ss.str(); ss.str(""); ss << "compareMatrices(order, " << args.M->name() << ", " << args.N->name() << ", " << B->matrixPointer() << ", " << naiveB->matrixPointer() << ", " << args.ldb->name() << ")"; compareCall_ = ss.str(); ss.str(""); ss << "setUpTRSMDiagonal(order, side, uplo, transA, diag, " << args.M->name() << ", " << args.N->name() << ", " << args.alpha->name() << ", " << A->matrixPointer() << ", " << args.lda->name() << ", " << B->matrixPointer() << ", " << args.ldb->name() << ")"; postRandomCall_ = ss.str(); } void TrsmStep::fixLD() { CLBlasKargs args; args = kargs(); if (args.side == clblasLeft) { if (args.lda.matrix < args.M) { args.lda.matrix = args.M; } } else { if (args.lda.matrix < args.N) { args.lda.matrix = args.N; } } if ((args.order == clblasColumnMajor) && (args.ldb.matrix < args.M)) { args.ldb.matrix = args.M; } if ((args.order == clblasRowMajor) && (args.ldb.matrix < args.N)) { args.ldb.matrix = args.N; } // Store original problem size in K, this is used to know it while // calculating result by parts using M or N as part size if (args.side == clblasLeft) { args.K = args.M; } else { args.K = args.N; } setKargs(args); } clblas-2.10/src/library/tools/ktest/steps/trsm.h000066400000000000000000000020671264277366700217460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_TRSM_H__ #define KTEST_TRSM_H__ #include "../step.h" namespace clMath { class TrsmStep : public Step { public: TrsmStep(cl_device_id device); TrsmStep(ListNode *node); virtual void fixLD(); virtual void declareVars(Step *masterStep); }; } // namespace clMath #endif // KTEST_TRSM_H__ clblas-2.10/src/library/tools/ktest/var.cpp000066400000000000000000000074761264277366700207570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "var.h" using namespace clMath; struct MemFlags { cl_mem_flags flag; const char* name; }; static const struct MemFlags MEM_FLAGS[] = { { CL_MEM_READ_WRITE, "CL_MEM_READ_WRITE" }, { CL_MEM_WRITE_ONLY, "CL_MEM_WRITE_ONLY" }, { CL_MEM_READ_ONLY, "CL_MEM_READ_ONLY" }, { CL_MEM_USE_HOST_PTR, "CL_MEM_USE_HOST_PTR" }, { CL_MEM_ALLOC_HOST_PTR,"CL_MEM_ALLOC_HOST_PTR" }, { CL_MEM_COPY_HOST_PTR, "CL_MEM_COPY_HOST_PTR" }, { 0, NULL } }; Variable::Variable( const std::string& name, const std::string& type, const std::string& defaultValue) { name_ = name; type_ = type; defaultValue_ = defaultValue; isBuffer_ = false; constant_ = false; copyOf_ = NULL; flags_ = 0; hostPtr_ = NULL; } Variable::Variable() { Variable("", ""); } MatrixVariable::MatrixVariable( const std::string& name, const std::string& type, const std::string& defaultValue) { name_ = name; type_ = type; defaultValue_ = defaultValue; isBuffer_ = false; constant_ = false; copyOf_ = NULL; flags_ = 0; hostPtr_ = NULL; rows_ = NULL; columns_ = NULL; ld_ = NULL; off_ = NULL; } VectorVariable::VectorVariable( const std::string& name, const std::string& type, const std::string& defaultValue) { name_ = name; type_ = type; defaultValue_ = defaultValue; isBuffer_ = false; constant_ = false; copyOf_ = NULL; flags_ = 0; hostPtr_ = NULL; nElems_ = NULL; inc_ = NULL; off_ = NULL; } Variable::~Variable() { } void Variable::setDefaultValue(const std::string& defaultValue) { defaultValue_ = defaultValue; } void Variable::setConstant(bool constant) { constant_ = constant; } void Variable::setCopy(Variable *copy) { copyOf_ = copy; } void MatrixVariable::setMatrixSize( Variable *rows, Variable *columns, Variable *ld, Variable *off) { if ((rows == NULL) || (columns == NULL)) { return; } rows_ = rows; columns_ = columns; ld_ = ld; off_ = off; matrixPointer_ = name_; if (off != NULL) { matrixPointer_ += " + " + off_->name(); } } void 
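/* Like MatrixVariable::setMatrixSize() above, this records the size and
   stride variables and precomputes the pointer expression ("X" or
   "X + offX") that the generated host code passes to the reference BLAS
   and comparison calls. */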
VectorVariable::setVectorSize( Variable *nElems, Variable *inc, Variable *off) { if (nElems == NULL) { return; } nElems_ = nElems; inc_ = inc; off_ = off; vectorPointer_ = name_; if (off != NULL) { vectorPointer_ += " + " + off_->name(); } } std::string Variable::flagsStr() const { std::string str; size_t i; if (type_ != "cl_mem") { return ""; } if (flags_ == 0) { return "0"; } for (i = 0; MEM_FLAGS[i].flag != 0; i++) { if (flags_ & MEM_FLAGS[i].flag) { if (!str.empty()) { str += " | "; } str += MEM_FLAGS[i].name; } } return str; } void Variable::setFlags(cl_mem_flags flags) { if (type_ == "cl_mem") { flags_ = flags; } } void Variable::setHostPtr(Variable *hostPtr) { if (type_ == "cl_mem") { hostPtr_ = hostPtr; } } clblas-2.10/src/library/tools/ktest/var.h000066400000000000000000000112641264277366700204120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef KTEST_VAR_H__ #define KTEST_VAR_H__ #ifdef __APPLE__ #include #else #include #endif #include namespace clMath { typedef enum BufferID { BUFFER_NONE, BUFFER_A, BUFFER_B, BUFFER_C } BufferID; /** * @internal * @brief Variable class * * Objects of this class store name, type and other attributes of variables * necessary for further code generation. * */ class Variable { protected: std::string name_; std::string type_; std::string defaultValue_; bool constant_; bool isBuffer_; BufferID bufID_; Variable *copyOf_; /* Buffer object info */ cl_mem_flags flags_; Variable *hostPtr_; public: Variable(const std::string& name, const std::string& type, const std::string& defaultValue = ""); Variable(); ~Variable(); const std::string& name() const { return name_; } const std::string& type() const { return type_; } const std::string& defaultValue() const { return defaultValue_; } void setDefaultValue(const std::string& defaultValue); bool constant() const { return constant_; } bool isBuffer() const { return isBuffer_; } BufferID getBufID() const { return bufID_; } void setConstant(bool constant); void setIsBuffer(bool isBuffer) { isBuffer_ = isBuffer; } Variable* copyOf() const { return copyOf_; } void setCopy(Variable *copy); void setBufferID(BufferID bufID) { bufID_ = bufID; } cl_mem_flags flags() const { return flags_; } std::string flagsStr() const; void setFlags(cl_mem_flags flags); Variable* hostPtr() const { return hostPtr_; } void setHostPtr(Variable *var); }; class ArrayVariableInterface : public Variable { public: virtual bool isMatrix() = 0; virtual bool isVector() = 0; virtual ~ArrayVariableInterface() {} }; /** * @internal * @brief Matrix variable class * * Objects of this class store information about matrix array * necessary for further code generation. 
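*
* Besides the size variables, a matrix precomputes the pointer expression
* returned by matrixPointer(); a small illustration (the names and the
* "cl_float" element type are only examples):
* @code
* MatrixVariable *A = addMatrix("A", "cl_float*", M, K, lda, offA);
* A->matrixPointer();   // yields the string "A + offA"
* @endcode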
* */ class MatrixVariable : public ArrayVariableInterface { private: /* Matrix info */ Variable *rows_; Variable *columns_; Variable *ld_; Variable *off_; std::string matrixPointer_; public: Variable* rows() const { return rows_; } Variable* columns() const { return columns_; } Variable* ld() const { return ld_; } Variable* off() const { return off_; } bool isMatrix() { return true; } bool isVector() { return false; } const std::string& matrixPointer() const { return matrixPointer_; } void setMatrixSize(Variable *rows, Variable *columns, Variable *ld = NULL, Variable *off = NULL); MatrixVariable(const std::string& name, const std::string& type, const std::string& defaultValue = ""); ~MatrixVariable() {}; }; /** * @internal * @brief Vector variable class * * Objects of this class store information about vector array * necessary for further code generation. * */ class VectorVariable : public ArrayVariableInterface { private: /* Vector info */ Variable *nElems_; Variable *inc_; Variable *off_; std::string vectorPointer_; public: Variable* nElems() const { return nElems_; } Variable* inc() const { return inc_; } Variable* off() const { return off_; } virtual bool isMatrix() { return false; } virtual bool isVector() { return true; } const std::string& vectorPointer() const { return vectorPointer_; } void setVectorSize(Variable *nElems, Variable *inc, Variable *off = NULL); VectorVariable(const std::string& name, const std::string& type, const std::string& defaultValue = ""); }; } // namespace clMath #endif // KTEST_VAR_H__ clblas-2.10/src/library/tools/tplgen/000077500000000000000000000000001264277366700176045ustar00rootroot00000000000000clblas-2.10/src/library/tools/tplgen/CMakeLists.txt000066400000000000000000000015551264277366700223520ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## cmake_minimum_required(VERSION 2.6) project(tplgen C CXX) ADD_DEFINITIONS(/D_CRT_SECURE_NO_WARNINGS) ADD_EXECUTABLE(tplgen tplgen.cpp) clblas-2.10/src/library/tools/tplgen/tplgen.cpp000066400000000000000000000140351264277366700216040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #ifdef __GNUC__ // Linux #include #include #include #else // Windows #include #include #include #define stat _stat #endif using namespace std; void binaryCaseProcess(const string &inputStr, std::ostream &outFile) { //Get the binary location in fileName size_t found = inputStr.find( '@' ); string fileName = inputStr.substr (found+1); //Open the binary std::ifstream file (fileName.c_str(), std::ios::in | std::ios::binary | std::ios::ate); size_t fileSize; if(!file.is_open()) { std::cerr << "fail to open binary file '" << fileName << "'" << std::endl; exit(1); } //Get contents of the binary char* fileContents; fileSize = file.tellg(); fileContents = new char[fileSize]; file.seekg(0, std::ios::beg); if(!file.read(fileContents, fileSize)) { std::cerr << "fail to read binary file '" << fileName << "'" << std::endl; exit(1); } file.close(); outFile << "//generated from the binary: " << fileName << "\n"; //Copy the chars found before the @ outFile << inputStr.substr (0,found); //Write contents of the binary outFile << "[" << fileSize << "] = {\n"; for(int i=0; i < fileSize; i++) { outFile << (int) fileContents[i]; if(i < fileSize-1) outFile << ","; if((i+1)%50 == 0) outFile << "\n"; } outFile << "\n};\n"; } bool isModified( char *clFile, char *clTFile ) { struct stat queryClFile; struct stat queryClTFile; int retval1, retval2; retval1 = stat( clFile, &queryClFile ); retval2 = stat( clTFile, &queryClTFile ); if (retval1 != 0) { // // No CL file to process // return false; } if (retval2 == 0) { // // Both files are present // return ( (queryClFile.st_mtime) >= (queryClTFile.st_mtime) )? true: false; } // // Force a CLT generation - Only CL is present // return true; } int main( int argc, char *argv[] ) { bool validKernel; int lineCount; size_t found; string str; int startOptions = 1; const char *outputPrefix = ""; const char *inputPrefix = ""; char tempInputPrefix[1024]; const char *inputfile = ""; std::cout << "TPLGEN Running.....\n"; if (argc < 2) { return -1; } if (strcmp(argv[1], "-o") == 0) { if (argc < 3) { return -1; } outputPrefix = argv[2]; startOptions = 3; } if (strcmp(argv[startOptions], "-i") == 0) { inputPrefix = argv[startOptions + 1]; startOptions += 2; } for ( int i=startOptions; i // sqrt() #include "toolslib.h" #include "clblas_stddef.h" #include "storage_data.h" unsigned int DimensionsArrayL3[]= {7, 13, 32, 48, 64, 64}; unsigned int DimensionsArrayL2[]= {768/4, 1792/4, 3328/4, 5248/4, 6784/4, 3*1024/4}; int getDimensionCount(TargetDevice* tdev, int func) { (void)tdev; (void)func; return DIMARRAYCOUNT; } // dimension getDimensionID(TargetDevice* tdev, int func, size_t M, size_t N, size_t K) { (void)tdev; (void)func; (void)M; (void)N; (void)K; return 0; } #include unsigned int getDimension(int idx, DataType dt, DeviceInfo *devInfo, int func) { unsigned int dim; // bas - banks aligned size, in bytes, should be // number of banks * number of channels * bytes per channel // here it is set to 8*256 = 2048 = 512 floats size_t bas = 8*256; unsigned int tsize; // The minimum step for which the tails are not. size_t noTailStep; float step; (void) func; tsize = dtypeSize(dt); noTailStep = 256 * sizeof(cl_float) / tsize; // !!! DEBUG //printf("[%s, line %d]: devInfo->globalSize = %lu\n", // __func__, __LINE__, devInfo->globalSize); /* * Skip the smallest size, it does not provide sufficient * device payload anyway */ //i = (idx == DIMARRAYCOUNT - 1) ? 
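/* Rough worked example (numbers are illustrative only): on a 32-CU device a
   single-precision level-3 function with idx == 2 gives
   step = min(32, 24) * 4 = 96 and dim = 96 * 32 = 3072; with
   noTailStep == 256 the later roundUp(3072 - 128, 256) lands back on 3072,
   which is then either kept bank-aligned or bumped by noTailStep, depending
   on whether this index is the bank-aligned test case. */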
(DIMARRAYCOUNT - 1) : (idx + 1); // dim = DimensionsArray2[i]; // dim *= devInfo->nrComputeUnits; step = (float)umin(devInfo->nrComputeUnits, funcBlasLevel(func) == 2 ? 1 : 24); switch (dt) { case TYPE_FLOAT: step *= 4; break; case TYPE_DOUBLE: case TYPE_COMPLEX_FLOAT: step = 2.8f * step; break; case TYPE_COMPLEX_DOUBLE: #if defined(_WIN32) && defined(FORCE_BSOD) if (func != CLBLAS_SYRK && func != CLBLAS_SYR2K) { step *= 2; } #else step *= 2; #endif break; } if (funcBlasLevel(func) == 2) { dim = (unsigned int)(step * DimensionsArrayL2[idx]); } else { dim = (unsigned int)(step * DimensionsArrayL3[idx]); } if (dim * dim * tsize > devInfo->maxMemAllocSize) { dim = (unsigned int)sqrt((double)(devInfo->maxMemAllocSize / tsize)); } assert(devInfo->globalSize); if (dim * dim * tsize >= devInfo->globalSize / 3) { dim = (unsigned int)sqrt((double)devInfo->globalSize / 3 / tsize); } dim = (unsigned int)roundUp(dim - (noTailStep/2), noTailStep); if (idx == BANK_ALIGNED_CASE_RECORD_IDX) { // force size to be banks aligned if (dim * dtypeSize(dt) % bas != 0) { dim = (unsigned int)roundUp(dim, bas / dtypeSize(dt)); } } else { // avoid banks aligned size adding maximal base dimension if (dim * dtypeSize(dt) % bas == 0) { // dim += DimensionsArray2[DIMARRAYCOUNT - 1] / // (dtypeSize(dt) / sizeof(cl_float)); dim += (unsigned int)noTailStep; } } return dim; } clblas-2.10/src/library/tools/tune/fileio.c000066400000000000000000000242731264277366700207110ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include "fileio.h" #ifdef _WIN32 const char dirDelimiter = '\\'; #else const char dirDelimiter = '/'; #endif //TODO typedef unsigned int uint32_t; typedef uint32_t uint_least32_t; /* Name : CRC-32 Poly : 0x04C11DB7 x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 Init : 0xFFFFFFFF Revert: true XorOut: 0xFFFFFFFF Check : 0xCBF43926 ("123456789") MaxLen: 268 435 455 ���� (2 147 483 647 ���) */ const uint_least32_t Crc32Table[256] = { 0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3, 0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91, 0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7, 0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5, 0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B, 0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59, 0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F, 0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D, 0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433, 0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01, 0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457, 0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65, 0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB, 0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9, 0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F, 0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD, 0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683, 0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1, 0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7, 0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5, 0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B, 0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79, 0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F, 0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D, 0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713, 0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21, 0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777, 0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45, 0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB, 0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9, 0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF, 
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D }; void hfInit(HfInfo* hf) { hf->file = NULL; hf->hash = 0; } char * hfCreateFullPatch( const char* path, const char * name, const char * ext ) { char * fname; size_t pathSize; size_t nameSize; size_t extSize; pathSize = strlen(path); nameSize = strlen(name); extSize = strlen(ext); // Add three characters, terminating zero, slash and dot fname = (char*) malloc((pathSize + nameSize + extSize + 3) * sizeof(char)); strcpy(fname, path); // Added file neme if (fname[pathSize - 1] != dirDelimiter && nameSize > 0){ fname[pathSize] = dirDelimiter; pathSize += 1; fname[pathSize] = '\0'; } strcat(fname, name); strcat(fname, "."); strcat(fname, ext); return fname; } uint_least32_t Crc32(const unsigned char * buf, size_t len) { uint_least32_t crc = 0xFFFFFFFF; while (len--) crc = (crc >> 8) ^ Crc32Table[(crc ^ *buf++) & 0xFF]; return crc ^ 0xFFFFFFFF; } uint_least32_t Crc32Add(const unsigned char * buf, size_t len, uint_least32_t crc) { //uint_least32_t crc = 0xFFFFFFFF; while (len--) crc = (crc >> 8) ^ Crc32Table[(crc ^ *buf++) & 0xFF]; return crc ^ 0xFFFFFFFF; } int hfOpenRead(HfInfo* hf, const char* filename) { hf->hash = 0; #ifdef _DEBUG_TOOLS hf->fileLog = NULL; #endif // _DEBUG if (filename == NULL) { return FILE_NOT_FOUND; } hf->file = fopen(filename, "rb"); if (hf->file == NULL){ return FILE_NOT_FOUND; } return FILE_OK; } int hfOpenWrite(HfInfo* hf, const char* filename) { hf->hash = 0; #ifdef _DEBUG_TOOLS { char* logName = hfCreateFullPatch(filename, "", "log"); hf->fileLog = fopen(logName, "w"); free(logName); } #endif // _DEBUG hf->file = fopen(filename, "wb"); if (!hf->file){ return FILE_ERROR_OPEN_FOR_WRITING; } return FILE_OK; } int hfOpenReWrite(HfInfo* hf, const char* filename) { hf->hash = 0; if (hf->file != NULL){ hfClose(hf); } #ifdef _DEBUG_TOOLS { char* logName = hfCreateFullPatch(filename, "", "log"); hf->fileLog = fopen(logName, "a"); fprintf(hf->fileLog, " ====================== \n"); free(logName); } #endif // _DEBUG hf->file = fopen(filename, "rb+"); return FILE_OK; } int hfClose( HfInfo* hf ) { int ret = 0; if (hf->file != NULL){ ret = fclose(hf->file); hf->file = NULL; } #ifdef _DEBUG_TOOLS if (hf->fileLog != NULL){ ret = fclose(hf->fileLog); hf->fileLog = NULL; } #endif // _DEBUG return ret; } int hfWrite( HfInfo* hf, const void* buff, size_t size ) { hf->hash = Crc32Add(buff, size, hf->hash); #ifdef _DEBUG_TOOLS hf->start = ftell(hf->file); #endif // _DEBUG fwrite(buff, size, 1, hf->file); #ifdef _DEBUG_TOOLS hf->end = ftell(hf->file); fprintf(hf->fileLog, " %8d - %8d (%8d) \n",(int)hf->start, (int)hf->end, (int)size ); fflush(hf->fileLog); #endif // _DEBUG return 0; } int hfWriteCRC( HfInfo* hf ) { #ifdef _DEBUG_TOOLS hf->start = ftell(hf->file); #endif // _DEBUG fwrite (&hf->hash, sizeof(hf->hash), 1, hf->file); hf->hash = 0; #ifdef _DEBUG_TOOLS hf->end = ftell(hf->file); fprintf(hf->fileLog, "CRC %8d - %8d (%8lu) \n",(int)hf->start, (int)hf->end, sizeof(hf->hash) ); fflush(hf->fileLog); #endif // _DEBUG //} fflush(hf->file); return FILE_OK; } int hfReadWithoutCRC( HfInfo* hf, void* buff, size_t size ) { size_t readSize; readSize = fread(buff, 1, size, hf->file); return (int)readSize; } int hfRead( HfInfo* hf, void* buff, int c, size_t size ) { size_t readSize; int i=0; #ifdef _DEBUG_TOOLS hf->start = ftell(hf->file); #endif // _DEBUG readSize = fread(buff, size, c, hf->file); if (readSize != (size_t)c){ return FILE_ERROR_READ_DATA; } for (; i < c; ++i){ hf->hash = 
Crc32Add((const unsigned char*)buff + (i*size), size, hf->hash); } #ifdef _DEBUG_TOOLS hf->end = ftell(hf->file); #endif // _DEBUG return FILE_OK; } int hfReadConst( HfInfo* hf, const void* buff, size_t size ) { int ret; void* buff2 = malloc(size); ret = FILE_OK; hfRead(hf, buff2, 1, size); if (memcmp(buff, buff2, size) != 0){ ret = FILE_ERROR_BUFFER_MISMATCH; } free(buff2); //hf->isUseHach = true; return ret; } int hfCheckCRC( HfInfo* hf ) { int ret; TYPECRC crc = 0; size_t readSize; #ifdef _DEBUG_TOOLS hf->start = ftell(hf->file); #endif // _DEBUG readSize = fread(&crc, sizeof(crc), 1, hf->file); if (readSize == 1){ if (crc == hf->hash){ ret = FILE_OK; } else { ret = FILE_ERROR_CRC; } } else { ret = FILE_ERROR_READ_DATA; } #ifdef _DEBUG_TOOLS hf->end = ftell(hf->file); #endif // _DEBUG hf->hash = 0; return ret ; } int hfReadString( HfInfo* hf, char** str ) { int status; unsigned int strLen; //long int pos = ftell(hf->file); status = hfRead(hf, &strLen, 1, sizeof(unsigned int)); *str = malloc(strLen + 1); status += hfRead(hf, *str, 1, (size_t)strLen); (*str)[strLen] = '\0'; return status; } int hfWriteString( HfInfo* hf, const char* buff ) { int status; unsigned int strLen = (unsigned int)strlen(buff); status = hfWrite(hf, &strLen, sizeof(unsigned int)); status = hfWrite(hf, buff, strLen); return status; } int hfJump( HfInfo* hf, POSFILE pos ) { fseek(hf->file, (long)pos, SEEK_SET); hf->hash = 0; return FILE_OK; } int hfGetCurentPosition( HfInfo* hf, POSFILE* pos ) { *pos = ftell(hf->file); return FILE_OK; } clblas-2.10/src/library/tools/tune/fileio.h000066400000000000000000000053631264277366700207150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef FILEIO_H__ #define FILEIO_H__ #include #include #include #include #define FILE_OK 0x0000 #define FILE_NOT_FOUND 0x0100 #define FILE_ERROR_OPEN_FOR_WRITING 0x0101 #define FILE_ERROR_READ_DATA 0x0201 #define FILE_ERROR_RESERVED_OVERFLOW 0x0501 #define FILE_ERROR_RESERVED_NOT_FULL 0x0502 #define FILE_ERROR_BUFFER_MISMATCH 0x0601 #define FILE_ERROR_CRC 0x0701 #define FILE_ERROR_INDALID_KERNAL_SIZE 0x0801 typedef unsigned int TYPECRC; #if defined (_WIN32) typedef unsigned __int64 POSFILE; #else #include typedef u_int64_t POSFILE; #endif typedef struct HfInfo { FILE* file; TYPECRC hash; // CRC32 #ifdef _DEBUG_TOOLS FILE* fileLog; POSFILE start; POSFILE end; #endif // _DEBUG }HfInfo; // Structure initialization void hfInit(HfInfo* hf); // Open file for reading int hfOpenRead (HfInfo* hf, const char* filename); // Open file for writing. // if _DEBUG macro is defined, the log file is created. 
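// Returns FILE_OK on success, or FILE_ERROR_OPEN_FOR_WRITING if the file
// cannot be created.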
int hfOpenWrite(HfInfo* hf, const char* filename); int hfOpenReWrite(HfInfo* hf, const char* filename); int hfReadWithoutCRC( HfInfo* hf, void* buff, size_t size ); int hfRead(HfInfo* hf, void* buff, int c, size_t size); // Skip data witch calculate CRC // int hfSkip(HfInfo* hf, size_t c, size_t size); //Jamp to position "pos" without calculation CRC int hfJump(HfInfo* hf, POSFILE pos); // int hfGetCurentPosition(HfInfo* hf, POSFILE* pos); int hfReadString(HfInfo* hf, char** str); //! Read data and compare with buff //! \return HF_FILE_ERROR_BUFFER_MISMATCH int hfReadConst(HfInfo* hf, const void* buff, size_t size); //! int hfCheckCRC(HfInfo* hf); int hfWrite(HfInfo* hf, const void* buff, size_t size); int hfWriteString(HfInfo* hf, const char* buff); int hfWriteCRC(HfInfo* hf); int hfClose(HfInfo* hf); char * hfCreateFullPatch( const char* path, const char * name, const char * ext ); #endif /* FILEIO_H__ */ clblas-2.10/src/library/tools/tune/storage_data.c000066400000000000000000000241771264277366700221020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "storage_data.h" #include "assert.h" BlasParamInfo* findParam( StorageCacheImpl* cacheImpl, const char* pattName, const DataType dt, const KernelExtraFlags kflag, int dim) { unsigned int func; BlasFunctionInfo *functionInfo = cacheImpl->functionInfo; //unsigned int mask[BLAS_FUNCTIONS_NUMBER]; //initMask(mask); for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) { unsigned int patt; BlasFunctionInfo* bFunc = &functionInfo[func]; for (patt =0; patt < bFunc->numPatterns; ++ patt) { unsigned int extra; BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; if (strcmp(bPatt->name, pattName) == 0) { KernelExtraFlags flag = kflag & bFunc->maskForTuningsKernel; for (extra =0; extra < bPatt->numExtra; ++ extra) { BlasExtraInfo* bExtra = &bPatt->extra[extra]; if (bExtra->dtype == dt && bExtra->flags == flag) { unsigned int param; BlasParamInfo* bestParam = NULL; unsigned int bestDimDelta = 50000; if (dim == 0) { //leading dimension banks aligned case bestParam = &bExtra->param[BANK_ALIGNED_CASE_RECORD_IDX]; } else { for (param = 0; param < bExtra->numParam; ++param) { BlasParamInfo* bParam = &bExtra->param[param]; unsigned int dimDelta = abs(dim - bParam->dim); if (param == BANK_ALIGNED_CASE_RECORD_IDX) { continue; } if (dimDelta < bestDimDelta){ bestDimDelta = dimDelta; bestParam = bParam; } } } return bestParam; } } } } } return NULL; } BlasPatternInfo * getPatternInfo(StorageCacheImpl* cache, unsigned int func, unsigned int patt) { BlasPatternInfo* bPatt = NULL; if (func != BLAS_FUNCTIONS_NUMBER) { BlasFunctionInfo* bFunc = &cache->functionInfo[func]; bPatt = &bFunc->pattInfo[patt]; } return bPatt; } void nextPattern(StorageCacheImpl* cache, unsigned int* func, unsigned int* patt) { BlasFunctionInfo* bFunc = &cache->functionInfo[*func]; 
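    /* Advance to the next memory pattern of the current function; once the
       last pattern has been visited, move on to the first pattern of the
       next BLAS function. */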
(*patt)++; if (bFunc->numPatterns == *patt) { (*func)++; *patt = 0; } } //////////////////////////////////////////////////////////////////////////////// bool isValidFlagMatrix(DataType curType, unsigned int flags) { bool ret; // todo Make refactoring expressions. ret = !isComplexType(curType) && ( (flags & KEXTRA_CONJUGATE_A) || (flags & KEXTRA_CONJUGATE_B)); // The flag KEXTRA_CONJUGATE_X can be set TRUE only when the flag KEXTRA_TRANS_X is TRUE. ret = ret || (flags & (KEXTRA_TRANS_A | KEXTRA_CONJUGATE_A)) == KEXTRA_CONJUGATE_A; ret = ret || (flags & (KEXTRA_TRANS_B | KEXTRA_CONJUGATE_B)) == KEXTRA_CONJUGATE_B; return ret; } size_t getDTypeArray(DataType * dTypes, size_t dtypeCount, DeviceInfo* defInf ) { if (dtypeCount < 4) { return 0; } if (defInf->nativeDouble) { if (defInf->nativeComplex) { dTypes[0] = TYPE_FLOAT; dTypes[1] = TYPE_COMPLEX_FLOAT; dTypes[2] = TYPE_DOUBLE; dTypes[3] = TYPE_COMPLEX_DOUBLE; dtypeCount = 4; } else { dTypes[0] = TYPE_FLOAT; dTypes[1] = TYPE_DOUBLE; dtypeCount = 2; } } else { if (defInf->nativeComplex) { dTypes[0] = TYPE_FLOAT; dTypes[1] = TYPE_COMPLEX_FLOAT; dtypeCount = 2; } else { dTypes[0] = TYPE_FLOAT; dtypeCount = 1; } } return dtypeCount; } void initParamData (BlasParamInfo* bParam, int dim) { memset(bParam->sDim, 0, sizeof(SubproblemDim) * MAX_SUBDIMS); memset(&bParam->pGran, 0, sizeof(PGranularity) ); memset(bParam->kernel, 0, sizeof(OFFSET) * MAX_CLBLAS_KERNELS_PER_STEP); memset(bParam->kSize, 0, sizeof(size_t)* MAX_CLBLAS_KERNELS_PER_STEP); bParam->time = 1e50; // any large number; bParam->dim = dim; bParam->offset = 0; bParam->size = 0; bParam->sstatus = SS_NOLOAD; } void initExtraData(BlasExtraInfo* bExtra, DataType dTypes, unsigned int flags, DeviceInfo* di) { unsigned int param; int func = bExtra->parent->parent->funcNo; assert(bExtra->param == 0); bExtra->dtype = dTypes; bExtra->flags = flags; if (isComplexType(dTypes)) { bExtra->vecLen = 2; } else { bExtra->vecLen = 4; } bExtra->numParam = getDimensionCount(di->tdev, func); bExtra->offset = 0; bExtra->size = 0; bExtra->sstatus = SS_NOLOAD; bExtra->param = calloc( bExtra->numParam, sizeof(BlasParamInfo)); for (param = 0; param < bExtra->numParam; ++param) { BlasParamInfo* bParam = &bExtra->param[param]; initParamData(bParam, getDimension(param, bExtra->dtype, di, func)); } } int genExtraDatasForPattern( BlasPatternInfo* bPatt, unsigned int tuningsMask, unsigned int uniqueMask, DeviceInfo* defInf) { size_t dtypeCount; size_t ndt; unsigned int flags; unsigned int index; DataType dTypes[4]; BlasExtraInfo* extra; BlasFunctionInfo* bFunc; unsigned int extraCount; bFunc = bPatt->parent; extra = bPatt->extra; extraCount = bPatt->numExtra; bPatt->numTuneExtra = 0; dtypeCount = getDTypeArray(dTypes, 4, defInf); index = 0; for (flags = 0; flags <= uniqueMask; flags++) { unsigned int m = flags & (~uniqueMask); if (!m){ for (ndt = 0; ndt < dtypeCount; ++ndt) { DataType curType = dTypes[ndt]; if ( bFunc->isValidFlag != NULL && bFunc->isValidFlag(curType, flags)) { continue; } if (extra != NULL) { unsigned int tm; if (index == extraCount) { return index; } extra[index].parent = bPatt; initExtraData(&extra[index], dTypes[ndt], flags, defInf); tm = flags & (~tuningsMask); extra[index].isUseForTunning = tm == 0; if (extra[index].isUseForTunning) { bPatt->numTuneExtra++; } } ++index; } } else { m = (m&(m-1))^m; flags = flags + m - 1; } } return index; } void initPatternData (BlasPatternInfo* bPatt, DeviceInfo* defInf) { unsigned int tuningsMask = bPatt->parent->maskForTuningsKernel; unsigned int uniqueMask = 
bPatt->parent->maskForUniqueKernels; assert(bPatt->numExtra == 0); assert(bPatt->extra == 0); bPatt->numExtra = genExtraDatasForPattern(bPatt, tuningsMask, uniqueMask, defInf); bPatt->offset = 0; bPatt->size = 0; bPatt->sstatus = SS_NOLOAD; bPatt->extra = calloc( bPatt->numExtra, sizeof(BlasExtraInfo)); genExtraDatasForPattern(bPatt, tuningsMask, uniqueMask, defInf); } void initFuncData (BlasFunctionInfo* bFunc, DeviceInfo* defInf) { unsigned int patt; bFunc->isValidFlag = isValidFlagMatrix; if (bFunc->initFunctionInfo != NULL) { bFunc->initFunctionInfo(bFunc); } for (patt = 0 ; patt < bFunc->numPatterns; ++patt) { BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; bPatt->parent = bFunc; bPatt->name = bFunc->pattern[patt].name; bPatt->pattNo = patt; initPatternData (bPatt, defInf); } } void initCacheData (BlasFunctionInfo* bFuncs, DeviceInfo* defInf) { unsigned int func; for (func=0; func < BLAS_FUNCTIONS_NUMBER; ++func) { BlasFunctionInfo* bFunc = &bFuncs[func]; bFunc->funcNo = func; initFuncData(bFunc, defInf); } } void destroyParamData(BlasParamInfo* bParam) { int k; for (k=0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) { bParam->kSize[0] = 0; } } void destroyExtraData(BlasExtraInfo* bExtra) { unsigned int param; if (bExtra == NULL) { return; } for (param = 0; param < bExtra->numParam; ++param) { BlasParamInfo* bParam = &bExtra->param[param]; destroyParamData(bParam); } free(bExtra->param); } void destroyPatternData(BlasPatternInfo* bPatt) { unsigned int extra; for (extra = 0 ; extra < bPatt->numExtra; ++extra){ BlasExtraInfo* bExtra = &bPatt->extra[extra]; destroyExtraData (bExtra); } free (bPatt->extra); } void destroyFuncData(BlasFunctionInfo* bFunc) { unsigned int patt; for (patt = 0 ; patt < bFunc->numPatterns; ++patt) { BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; destroyPatternData (bPatt); } } void destroyData(BlasFunctionInfo* fInfo) { unsigned int func; for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func){ destroyFuncData( &fInfo[func]); } } clblas-2.10/src/library/tools/tune/storage_data.h000066400000000000000000000136271264277366700221050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef STORAGEDATA_H_ #define STORAGEDATA_H_ #include #include #include #include #ifdef __APPLE__ #include #else #include #endif #include #include "toolslib.h" #include "solution_seq.h" #include "matrix_dims.h" // typedef unsigned int OFFSET; /* Device information needed for tuning CLBLAS kernels. 
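 * Most fields are filled straight from clGetDeviceInfo queries in
 * initCLDeviceInfoRec().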
*/ typedef struct CLDeviceInfoRec { cl_uint nrComputeUnits; /* CL_DEVICE_MAX_COMPUTE_UNITS */ unsigned int nrStreamCores; /* Number of stream cores per Compute Unit */ cl_ulong globalSize; /* CL_DEVICE_GLOBAL_MEM_SIZE */ cl_ulong maxMemAllocSize; /* CL_DEVICE_MAX_MEM_ALLOC_SIZE */ cl_ulong ldsSize; /* CL_DEVICE_LOCAL_MEM_SIZE */ unsigned int wavefront; /* Number of work-items executed in parallel on hardware */ cl_uint alignment; /* CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE */ unsigned int addressBits; /* CL_DEVICE_ADDRESS_BITS */ size_t workItemSizes[3]; /* CL_DEVICE_MAX_WORK_ITEM_SIZES */ cl_uint workItemSizesDim; /* CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS */ size_t workGroupSizes; /* CL_DEVICE_MAX_WORK_GROUP_SIZE */ bool nativeDouble; /* Specifies whether device supports double precision float */ bool nativeComplex; /* Specifies whether device supports complex float */ TargetDevice* tdev; } DeviceInfo; typedef enum Dimensions{ DIMARRAY_SMALL, DIMARRAY_SHORT, DIMARRAY_MIDDLE, DIMARRAY_BIG, DIMARRAY_HUGE, DIMARRAY_BANK_CONFLICT, DIMARRAYCOUNT, // }Dimensions; // struct SubDimInfo; struct BlasFunctionInfo; struct BlasPatternInfo; struct BlasExtraInfo; struct MatrixInfo; typedef enum SynchStatus { SS_NOLOAD, SS_CORRECT_DATA, SS_INCORRECT_DATA, }SynchStatus; typedef struct BlasParamInfo { int dim; SubproblemDim sDim[MAX_SUBDIMS]; PGranularity pGran; OFFSET kernel[MAX_CLBLAS_KERNELS_PER_STEP]; unsigned int kSize[MAX_CLBLAS_KERNELS_PER_STEP]; double time; OFFSET offset; size_t size; SynchStatus sstatus; } BlasParamInfo; typedef struct BlasExtraInfo { struct BlasPatternInfo* parent; unsigned int numParam; DataType dtype; KernelExtraFlags flags; unsigned int vecLen; bool isUseForTunning; BlasParamInfo* param; OFFSET offset; size_t size; SynchStatus sstatus; } BlasExtraInfo; typedef struct BlasPatternInfo { struct BlasFunctionInfo* parent; unsigned int numExtra; unsigned int numTuneExtra; BlasExtraInfo* extra; const char * name; OFFSET offset; size_t size; SynchStatus sstatus; unsigned int pattNo; bool (*isPGValid) (struct SubDimInfo* sdi); void (*initSubdim)(struct SubDimInfo* sdi); } BlasPatternInfo; typedef struct BlasFunctionInfo { unsigned int numPatterns; int funcNo; unsigned int maskForTuningsKernel; unsigned int maskForUniqueKernels; const char* envImplementation; int defaultPattern; const char* name; // bool (*isValidFlag) (DataType curType, unsigned int flags); void (*initFunctionInfo) (struct BlasFunctionInfo* bFunc); void (*initKNM) (struct MatrixInfo*, unsigned int baseDim); BlasPatternInfo pattInfo[MEMPAT_PER_BLASFN]; MemoryPattern pattern[MEMPAT_PER_BLASFN]; } BlasFunctionInfo; typedef struct StorageCacheImpl { char* fpath; char* fpath_tmp; bool isInit; // bool isPopulate; // The cache has been initialized, // but does not contain data BlasFunctionInfo functionInfo[BLAS_FUNCTIONS_NUMBER]; DeviceIdent devIdent; OFFSET endFile; } StorageCacheImpl; /* * The 'force' argument set to true means returning a cache object even * if the file on disk doesn't exist */ StorageCacheImpl* getStorageCache(TargetDevice* devID, bool force); BlasParamInfo* findParam(StorageCacheImpl* cache, const char* pattName, const DataType dt, const KernelExtraFlags kflag, int dim); void loadKernelsFromFile(StorageCacheImpl* cache, BlasParamInfo* bParam, unsigned char** buffer, size_t* sizeBuffer); void loadDataFromFile(StorageCacheImpl* cache); char * createFullPatch(const char * name, bool tmp); OFFSET calcOffset(BlasFunctionInfo* functionInfo); BlasPatternInfo * getPatternInfo(StorageCacheImpl* cache, unsigned int func, 
unsigned int patt); void nextPattern(StorageCacheImpl* cache, unsigned int* func, unsigned int* patt); void saveBestParam(TargetDevice* tdev, BlasParamInfo* bParam); unsigned int getDimension(int idx, DataType dt, DeviceInfo* di, int func); bool initReadingData(StorageCacheImpl* cacheImpl, TargetDevice* devID ); void initBlasFuncionData(BlasFunctionInfo* fInfo); void initCacheData (BlasFunctionInfo* bFunc, DeviceInfo* defInfo); void initCLDeviceInfoRec(TargetDevice* tdev, DeviceInfo *devInfo); void destroyData(BlasFunctionInfo* fInfo); #endif /* STORAGEDATA_H_ */ clblas-2.10/src/library/tools/tune/storage_init.c000066400000000000000000000135171264277366700221300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "storage_data.h" void initGemm(BlasFunctionInfo* bFunc) { bFunc->name = "GEMM"; bFunc->envImplementation = "AMD_CLBLAS_GEMM_IMPLEMENTATION"; bFunc->numPatterns = initGemmMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 2; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_TRANS_B | KEXTRA_COLUMN_MAJOR ; bFunc->maskForUniqueKernels = KEXTRA_TRANS_A | KEXTRA_CONJUGATE_A | KEXTRA_TRANS_B | KEXTRA_CONJUGATE_B | KEXTRA_COLUMN_MAJOR | KEXTRA_BETA_ZERO ; } void initTrmm(BlasFunctionInfo* bFunc) { bFunc->name = "TRMM"; bFunc->envImplementation = "AMD_CLBLAS_TRMM_IMPLEMENTATION"; bFunc->numPatterns = initTrmmMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_UPPER_TRIANG | KEXTRA_SIDE_RIGHT | KEXTRA_COLUMN_MAJOR ; bFunc->maskForUniqueKernels = KEXTRA_TRANS_A | KEXTRA_CONJUGATE_A | KEXTRA_UPPER_TRIANG | KEXTRA_SIDE_RIGHT | KEXTRA_UNIT_DIAGONAL | KEXTRA_COLUMN_MAJOR ; } void initTrsm(BlasFunctionInfo* bFunc) { bFunc->name = "TRSM"; bFunc->envImplementation = "AMD_CLBLAS_TRSM_IMPLEMENTATION"; bFunc->numPatterns = initTrsmMemPatterns(bFunc->pattern); // FIXME Correct, when adding a new pattern will not lead to corrupt it. 
// don't create a partition for new TRSM pattern if (bFunc->numPatterns == 3) { bFunc->numPatterns = 2; } bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_UPPER_TRIANG | KEXTRA_SIDE_RIGHT | KEXTRA_COLUMN_MAJOR ; bFunc->maskForUniqueKernels = KEXTRA_TRANS_A | KEXTRA_CONJUGATE_A | KEXTRA_UPPER_TRIANG | KEXTRA_SIDE_RIGHT | KEXTRA_UNIT_DIAGONAL | KEXTRA_COLUMN_MAJOR ; } void initGemv(BlasFunctionInfo* bFunc) { bFunc->name = "GEMV"; bFunc->envImplementation = NULL; bFunc->numPatterns = initGemvMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A | KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG ; bFunc->maskForUniqueKernels = KEXTRA_TRANS_A | KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG | KEXTRA_BETA_ZERO | KEXTRA_INCX_ONE | KEXTRA_INCY_ONE ; } void initSymv(BlasFunctionInfo* bFunc) { bFunc->name = "SYMV"; bFunc->envImplementation = NULL; bFunc->numPatterns = initSymvMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG ; bFunc->maskForUniqueKernels = KEXTRA_COLUMN_MAJOR | KEXTRA_UPPER_TRIANG | KEXTRA_BETA_ZERO | KEXTRA_INCX_ONE | KEXTRA_INCY_ONE ; } void initSyr2k(BlasFunctionInfo* bFunc) { bFunc->name = "SYR2K"; bFunc->envImplementation = NULL; bFunc->numPatterns = initSyr2kMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A //| KEXTRA_CONJUGATE_A //| KEXTRA_TRANS_B //| KEXTRA_CONJUGATE_B | KEXTRA_COLUMN_MAJOR //| KEXTRA_UPPER_TRIANG //|KEXTRA_SIDE_RIGHT //| KEXTRA_TAILS_M //| KEXTRA_TAILS_N //| KEXTRA_TAILS_K //| KEXTRA_BETA_ZERO //| KEXTRA_NO_COPY_VEC_A = 0x1000, //| KEXTRA_NO_COPY_VEC_B = 0x2000, //| KEXTRA_NO_COPY_VEC_C = 0x4000, ; bFunc->maskForUniqueKernels = bFunc->maskForTuningsKernel; } void initSyrk(BlasFunctionInfo* bFunc) { bFunc->name = "SYRK"; bFunc->envImplementation = NULL; bFunc->numPatterns = initSyrkMemPatterns(bFunc->pattern); bFunc->defaultPattern = bFunc->numPatterns - 1; bFunc->maskForTuningsKernel = KEXTRA_TRANS_A //| KEXTRA_CONJUGATE_A //| KEXTRA_TRANS_B //| KEXTRA_CONJUGATE_B | KEXTRA_COLUMN_MAJOR //| KEXTRA_UPPER_TRIANG //|KEXTRA_SIDE_RIGHT //| KEXTRA_TAILS_M //| KEXTRA_TAILS_N //| KEXTRA_TAILS_K //| KEXTRA_BETA_ZERO //| KEXTRA_NO_COPY_VEC_A = 0x1000, //| KEXTRA_NO_COPY_VEC_B = 0x2000, //| KEXTRA_NO_COPY_VEC_C = 0x4000, ; bFunc->maskForUniqueKernels = bFunc->maskForTuningsKernel;} void initBlasFuncionData(BlasFunctionInfo* fInfo) { // unsigned int func; memset(fInfo, 0, BLAS_FUNCTIONS_NUMBER * sizeof(BlasFunctionInfo)); fInfo[CLBLAS_GEMM].initFunctionInfo = initGemm; fInfo[CLBLAS_TRMM].initFunctionInfo = initTrmm; fInfo[CLBLAS_TRSM].initFunctionInfo = initTrsm; fInfo[CLBLAS_GEMV].initFunctionInfo = initGemv; fInfo[CLBLAS_SYMV].initFunctionInfo = initSymv; fInfo[CLBLAS_SYR2K].initFunctionInfo = initSyr2k; fInfo[CLBLAS_SYRK].initFunctionInfo = initSyrk; } clblas-2.10/src/library/tools/tune/storage_io.c000066400000000000000000000455751264277366700216050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "fileio.h" #include "storage_data.h" #define SUBDIM_UNUSED_FILE_VALUE 10000 const char *ENV_FILE_PATH = "CLBLAS_STORAGE_PATH"; const char *FileID = "CBS"; const char *FileExt = "kdb"; const char *FileExtTmp = "kdb.tmp"; const int fileVersion = 3; POSFILE findPattern(HfInfo* file, const char* name) { const int bufSize = 1024*64; char buffer[1024*64]; POSFILE fpos = 0; int ib; int in; int bufRead; int nameLen = (int)strlen(name); hfJump(file, 1); in = 0; do { hfGetCurentPosition(file, &fpos); bufRead = hfReadWithoutCRC(file, buffer, bufSize); for (ib = 0; ib < bufRead; ++ib) { if (name[in] == buffer[ib]) { in++; if (in >= nameLen) { fpos += + ib - nameLen + 1 - sizeof(unsigned int); hfJump(file, fpos); return true; } }else{ in = 0; } } } while (bufRead == bufSize); return 0; } bool checkFile(HfInfo* file, size_t pos2, int status) { POSFILE pos; hfGetCurentPosition(file, &pos); if ((POSFILE)pos2 == pos && status == FILE_OK) { return true; } return false; } // PATTERN void calcPatternOffset(BlasPatternInfo* bPatt, POSFILE* offset) { unsigned int len = (unsigned int)strlen(bPatt->name) + 1; bPatt->size = sizeof(len); bPatt->size += len; bPatt->size += sizeof(bPatt->numExtra); bPatt->size += sizeof(TYPECRC); bPatt->offset = (OFFSET)*offset; *offset += (POSFILE)bPatt->size; } // PARAM void calcParamOffset(BlasParamInfo* bParam, POSFILE* offset) { bParam->size = sizeof(unsigned int) * 5 * MAX_SUBDIMS; bParam->size += sizeof(PGranularity); bParam->size += sizeof(POSFILE)*MAX_CLBLAS_KERNELS_PER_STEP; bParam->size += sizeof(bParam->kSize); bParam->size += sizeof(double); bParam->size += sizeof(TYPECRC); bParam->offset = (OFFSET)*offset; *offset += (POSFILE)bParam->size; } int loadParamData(HfInfo* file, BlasParamInfo* bParam) { int status = 0; int i = 0; int ret = 0; bool dimExist = true; for (i =0; i < MAX_SUBDIMS; i++){ unsigned int temp; status+= hfRead(file, &temp, 1, sizeof(temp)); bParam->sDim[i].x = (size_t)temp; status+= hfRead(file, &temp, 1, sizeof(temp)); bParam->sDim[i].y = (size_t)temp; status+= hfRead(file, &temp, 1, sizeof(temp)); bParam->sDim[i].itemX = (temp >= SUBDIM_UNUSED_FILE_VALUE) ? SUBDIM_UNUSED : (size_t)temp; status+= hfRead(file, &temp, 1, sizeof(temp)); bParam->sDim[i].itemY = (temp >= SUBDIM_UNUSED_FILE_VALUE) ? 
SUBDIM_UNUSED : (size_t)temp; status+= hfRead(file, &temp, 1, sizeof(temp)); bParam->sDim[i].bwidth = (size_t)temp; } status += hfRead(file, &bParam->pGran, 1, sizeof(PGranularity)); status += hfRead(file, bParam->kernel, 1, sizeof(POSFILE) * MAX_CLBLAS_KERNELS_PER_STEP); status += hfRead(file, bParam->kSize, 1, sizeof(bParam->kSize)); status += hfRead(file, &bParam->time, 1, sizeof(double) ); if ((status == FILE_OK) && (bParam->sDim[0].y == 0)) { dimExist = false; } status += hfCheckCRC(file); if (!dimExist && (status == FILE_ERROR_CRC)) { ret = 1; // file is valid but doesn't have actual data } else if (!checkFile(file, (size_t)bParam->offset + bParam->size, status)) { ret = -1; // file is corrupted } else if (bParam->time > 10000.0) { ret = 1; } if (ret) { memset(bParam->sDim, 0, sizeof(SubproblemDim) * MAX_SUBDIMS); memset(&bParam->pGran, 0, sizeof(PGranularity) ); memset(bParam->kernel, 0, sizeof(POSFILE) * MAX_CLBLAS_KERNELS_PER_STEP ); memset(bParam->kSize, 0, sizeof(unsigned int) * MAX_CLBLAS_KERNELS_PER_STEP ); bParam->time = 1e50; // any large number; } return ret; } // EXTRA DATA void calcExtraOffset(BlasExtraInfo* bExtra, POSFILE* offset) { bExtra->size = sizeof(unsigned int); bExtra->size += sizeof(unsigned int); bExtra->size += sizeof(unsigned int); bExtra->size += sizeof(TYPECRC); bExtra->offset = (OFFSET)*offset; *offset += (OFFSET)bExtra->size; } bool readExtraData( HfInfo* file, BlasExtraInfo* bExtra, int numParam) { int param; int ret = 0; if (bExtra->param == NULL) return false; for (param = 0; param < numParam; ++ param) { BlasParamInfo* bpi = &bExtra->param[param]; ret += loadParamData(file, bpi); if (ret == 0) { bpi->sstatus = SS_CORRECT_DATA; } } if (ret == 0) { bExtra->sstatus = SS_CORRECT_DATA; } return false; } bool loadPatternDataFromFile( HfInfo * file, char** name, unsigned int* len, unsigned int* numExtra) { int status = 0; status += hfRead(file, len, 1, sizeof(*len)); *name = malloc((*len)* sizeof(char)); status += hfRead(file, *name, 1, *len); status += hfRead(file, numExtra, 1, sizeof(unsigned int)); status += hfCheckCRC (file); return status == FILE_OK; } int readExtaDataHeader ( HfInfo * file, unsigned int* dtype, unsigned int* flags, unsigned int* numParam) { int status = 0; status += hfRead(file, dtype, 1, sizeof(unsigned int)); status += hfRead(file, flags, 1, sizeof(unsigned int)); status += hfRead(file, numParam, 1, sizeof(unsigned int)); status += hfCheckCRC(file); return status; } bool readPatternData( HfInfo* file, BlasPatternInfo* bPatt, int numExtra) { unsigned int dtype; unsigned int flags; unsigned int numParam; int ief = 0; int ied = 0; int ret; POSFILE extraSize = 0; if (numExtra > 2) { extraSize = bPatt->extra[1].offset - bPatt->extra[0].offset; } for (ief = 0; ief < numExtra; ++ief) { BlasExtraInfo* bExtra = &bPatt->extra[ied]; POSFILE curPos; ied++; hfGetCurentPosition(file, &curPos); ret = readExtaDataHeader(file, &dtype, &flags, &numParam); if (ret != FILE_OK) { hfJump(file, curPos + extraSize); continue; } bExtra->sstatus = SS_CORRECT_DATA; if ((bExtra->dtype == dtype) && (bExtra->flags == flags)) { readExtraData(file, bExtra, numParam); } else { } } return true; } int loadHeader(HfInfo* file) { int version; int status = 0; unsigned blasFunctionNumber; POSFILE posFile; status = hfReadConst(file, FileID, strlen(FileID)); status += hfRead(file, &version, 1, sizeof(version)); status += hfRead(file, &blasFunctionNumber, 1, sizeof(blasFunctionNumber)); status += hfRead(file, &posFile, 1, sizeof(posFile)); status += hfCheckCRC(file); return 
(status == 0)? version:0; } void saveHeader(HfInfo* file, unsigned int blasFunctionNumber, POSFILE binData) { int status = 0; status = hfWrite(file, FileID, strlen(FileID)); status += hfWrite(file, &fileVersion, sizeof(fileVersion)); status += hfWrite(file, &blasFunctionNumber, sizeof(blasFunctionNumber)); status += hfWrite(file, &binData, sizeof(binData)); status += hfWriteCRC(file); } bool checkOffset(BlasFunctionInfo* functionInfo) { unsigned int func; unsigned int patt; unsigned int extra; unsigned int param; bool ret = false; for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) { BlasFunctionInfo* bFunc = &functionInfo[func]; for (patt =0; patt < bFunc->numPatterns; ++ patt) { BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; ret |= (bPatt->offset == 0); for (extra =0; extra < bPatt->numExtra; ++ extra) { BlasExtraInfo* bExtra = &bPatt->extra[extra]; ret |= (bExtra->offset == 0 ); for (param =0; param < bExtra->numParam; ++ param) { BlasParamInfo* bParam = &bExtra->param[param]; ret |= (bParam->offset == 0 ); } } } } return ret; } void loadDataFromFile(StorageCacheImpl* cache) { bool structIsCorrect = true; char* name = NULL; unsigned int nameLen; unsigned int numExtra; unsigned int curFunc = 0; unsigned int curPatt = 0; unsigned int func; unsigned int patt; HfInfo file; if ( hfOpenRead(&file, cache->fpath) == FILE_NOT_FOUND ) { cache->isPopulate = false; return; } // Read file Header loadHeader(&file); // Read pattern header structIsCorrect &= loadPatternDataFromFile(&file, &name, &nameLen, &numExtra); while (structIsCorrect) { unsigned int func = curFunc; unsigned int patt = curPatt; bool ret; BlasPatternInfo* bPatt = getPatternInfo(cache, func, patt); while (bPatt != NULL && memcmp(name, bPatt->name, nameLen) != 0 ) { nextPattern(cache, &func, &patt); bPatt = getPatternInfo(cache, func, patt); } if (bPatt != NULL) { bPatt->sstatus = SS_CORRECT_DATA; // Read pattern data ret = readPatternData(&file, bPatt, numExtra); // go to next pattern nextPattern(cache, &func, &patt); // if the pattern is read witch error or not completely if (!ret) { bPatt = getPatternInfo(cache, func, patt); hfJump(&file, bPatt->offset); } curFunc = func; curPatt = patt; } free(name); name = NULL; structIsCorrect &= loadPatternDataFromFile(&file, &name, &nameLen, &numExtra ); } for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) { BlasFunctionInfo* bFunc = &cache->functionInfo[func]; for (patt =0; patt < bFunc->numPatterns; ++ patt){ BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; if (bPatt->sstatus == SS_NOLOAD) { POSFILE ret = findPattern(&file, bPatt->name); if (ret != 0) { loadPatternDataFromFile(&file, &name, &nameLen, &numExtra ); readPatternData(&file, bPatt, numExtra); } } } } free(name); cache->isPopulate = true; hfClose(&file); checkOffset(cache->functionInfo); } char * createFullPatch(const char * name, bool tmp) { char* path = getenv(ENV_FILE_PATH); const char * ext = (tmp)? 
FileExtTmp: FileExt; if (path == NULL) { return NULL; } return hfCreateFullPatch(path, name, ext); } OFFSET calcOffset(BlasFunctionInfo* functionInfo) { unsigned int func; unsigned int patt; unsigned int extra; unsigned int param; POSFILE pos = 0; pos += (POSFILE)strlen(FileID); pos += sizeof(int); // Version pos += sizeof(unsigned int); // Func Count; pos += sizeof(POSFILE); // Func Count; pos += sizeof(TYPECRC); for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) { BlasFunctionInfo* bFunc = &functionInfo[func]; for (patt =0; patt < bFunc->numPatterns; ++ patt) { BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; calcPatternOffset(bPatt, &pos); for (extra =0; extra < bPatt->numExtra; ++ extra) { BlasExtraInfo* bExtra = &bPatt->extra[extra]; calcExtraOffset(bExtra, &pos); for (param =0; param < bExtra->numParam; ++ param) { BlasParamInfo* bParam = &bExtra->param[param]; calcParamOffset(bParam, &pos); } } } } return (OFFSET)pos; } void loadKernelData( HfInfo* file, BlasParamInfo* bParam, unsigned char** buffer, size_t* sizeBuffer) { int k; int status = FILE_ERROR_READ_DATA; for (k =0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) { sizeBuffer[k] = bParam->kSize[k]; if (sizeBuffer[k] != 0 && bParam->kernel[k] != 0) { buffer[k] = malloc(sizeBuffer[k]); hfJump(file, bParam->kernel[k]); hfRead(file, buffer[k], 1, sizeBuffer[k]); status = hfCheckCRC(file); } if (status != FILE_OK) { sizeBuffer[k] = 0; buffer[k] = NULL; } } } void loadKernelsFromFile( StorageCacheImpl* cache, BlasParamInfo* bParam, unsigned char** buffer, size_t* sizeBuffer) { HfInfo file; hfOpenRead(&file, cache->fpath); loadKernelData(&file, bParam, buffer, sizeBuffer); hfClose(&file); } void saveKernelData ( StorageCacheImpl* cacheImpl, HfInfo* file, unsigned char** buffer, size_t* sizeBuffer) { int status; POSFILE pos; unsigned int k; for (k =0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) { pos = cacheImpl->endFile; status = hfJump(file, pos); status += hfWrite(file, &sizeBuffer[k], sizeof(size_t)); status += hfWrite(file, buffer[k], sizeBuffer[k]); status += hfWriteCRC(file); status += hfGetCurentPosition(file, &pos); if (status == FILE_OK) { cacheImpl->endFile = (OFFSET)pos; } } } bool copyKernalData( StorageCacheImpl* cacheImpl, HfInfo* oldfile, HfInfo* newfile, BlasParamInfo* bParam) { int k; unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP]; size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP]; loadKernelData(oldfile, bParam, buffer, sizeBuffer); saveKernelData(cacheImpl, newfile, buffer, sizeBuffer); for (k =0; k < MAX_CLBLAS_KERNELS_PER_STEP; ++k) { free (buffer[k]); } return false; } bool saveParamData (HfInfo* file, BlasParamInfo* bParam) { int status; int i; status = hfJump(file, bParam->offset); for (i =0; i < MAX_SUBDIMS; i++){ unsigned int temp; temp = (unsigned int)bParam->sDim[i].x; status+= hfWrite(file, &temp, sizeof(temp)); temp = (unsigned int)bParam->sDim[i].y; status+= hfWrite(file, &temp, sizeof(temp)); temp = (bParam->sDim[i].itemX == SUBDIM_UNUSED) ? SUBDIM_UNUSED_FILE_VALUE : (unsigned int)bParam->sDim[i].itemX; status+= hfWrite(file, &temp, sizeof(temp)); temp = (bParam->sDim[i].itemY == SUBDIM_UNUSED) ? 
SUBDIM_UNUSED_FILE_VALUE : (unsigned int)bParam->sDim[i].itemY; status+= hfWrite(file, &temp, sizeof(temp)); temp = (unsigned int)bParam->sDim[i].bwidth; status+= hfWrite(file, &temp, sizeof(temp)); } status += hfWrite(file, &bParam->pGran, sizeof(PGranularity)); status += hfWrite(file, bParam->kernel, sizeof(POSFILE)*MAX_CLBLAS_KERNELS_PER_STEP); status += hfWrite(file, bParam->kSize, sizeof(bParam->kSize)); status += hfWrite(file, &bParam->time, sizeof(double)); status += hfWriteCRC(file); return checkFile(file, (unsigned int) (bParam->offset + bParam->size), status); } bool saveExtraHeader(HfInfo* file, BlasExtraInfo* bExtra) { unsigned int dtype = (unsigned int)bExtra->dtype; unsigned int flags = (unsigned int)bExtra->flags; int status = hfJump(file, bExtra->offset); status += hfWrite(file, &dtype, sizeof(unsigned int)); status += hfWrite(file, &flags, sizeof(unsigned int)); status += hfWrite(file, &bExtra->numParam, sizeof(unsigned int)); status += hfWriteCRC(file); return checkFile(file, (size_t)bExtra->offset + bExtra->size, status); } bool savePatternHeader(HfInfo* file, BlasPatternInfo* bPatt) { unsigned int len; int status = hfJump(file, bPatt->offset); len = (unsigned int)strlen(bPatt->name) + 1; status += hfWrite(file, &len, sizeof(len)); status += hfWrite(file, bPatt->name, len); status += hfWrite(file, &bPatt->numExtra, sizeof(bPatt->numExtra)); status += hfWriteCRC(file); return checkFile(file, (size_t)bPatt->offset + bPatt->size, status); } static void printErrorMessage (int i, const char* filename) { switch (i) { case FILE_NOT_FOUND: printf("File \'%s\' not found\n", filename); break; case FILE_ERROR_CRC: case FILE_ERROR_INDALID_KERNAL_SIZE: printf("File \'%s\' is corrupted.\n", filename); break; case FILE_ERROR_OPEN_FOR_WRITING: printf("Can't open file \'%s\' for writing.\n", filename); break; case FILE_ERROR_BUFFER_MISMATCH: printf("Out of memory to read the file \'%s\'.\n", filename); break; } fflush(stdout); } /// void writeStorageCache(TargetDevice* tdev) { int func; unsigned int patt; unsigned int extra; unsigned int param; int fret; HfInfo outfile; HfInfo infile; StorageCacheImpl* cache = getStorageCache(tdev, true); // Open file for save fret = hfOpenWrite(&infile, cache->fpath); if (fret) { printErrorMessage(fret, cache->fpath); exit(2); } fret = hfOpenWrite(&outfile, cache->fpath_tmp); if (fret) { printErrorMessage(fret, cache->fpath_tmp); exit(2); } saveHeader(&outfile, BLAS_FUNCTIONS_NUMBER, 0); // For each function for (func =0; func < BLAS_FUNCTIONS_NUMBER; ++ func) { BlasFunctionInfo* bFunc = &cache->functionInfo[func]; // For each pattern for (patt =0; patt < bFunc->numPatterns; ++ patt){ BlasPatternInfo* bPatt = &bFunc->pattInfo[patt]; // Save pattern header savePatternHeader(&outfile, bPatt); for (extra =0; extra < bPatt->numExtra; ++ extra){ BlasExtraInfo* bExtra = &bPatt->extra[extra]; saveExtraHeader(&outfile, bExtra); // for (param =0; param < bExtra->numParam; ++param){ BlasParamInfo* bParam = &bExtra->param[param]; saveParamData(&outfile, bParam); } } } } hfClose(&infile); hfClose(&outfile); // rename file fret = remove(cache->fpath); if (fret == 0) { fret = rename(cache->fpath_tmp, cache->fpath); } // Re-init storage cache destroyStorageCache (); initStorageCache(); } //Saving of the best parameter. It is running at tuning of subproblem dimension. //The parameter saving in in advance selected place. 
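//The record is rewritten in place at the offset reserved for it when the
//cache layout was calculated (bParam->offset), so the rest of the file is
//left untouched.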
void saveBestParam(TargetDevice* tdev, BlasParamInfo* bParam) { HfInfo file; int status; StorageCacheImpl* cache; cache = getStorageCache(tdev, false); hfInit(&file); status = hfOpenReWrite(&file, cache->fpath); if (status == FILE_OK) { POSFILE pos = bParam->offset; hfJump(&file, pos); saveParamData(&file, bParam); bParam->sstatus = SS_CORRECT_DATA; } hfClose(&file); } clblas-2.10/src/library/tools/tune/subdim.c000066400000000000000000000472541264277366700207310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include #include #include "fileio.h" #include "toolslib.h" #include "tune.h" #include "subdim.h" #include #if defined(_MSC_VER) #define fmin min #define fmax max #endif #define isLdsUsed(pattern) \ (checkMatrixMemLevelSet(pattern, MATRIX_A, CLMEM_LEVEL_LDS) || \ checkMatrixMemLevelSet(pattern, MATRIX_B, CLMEM_LEVEL_LDS)) int VISIBILITY_HIDDEN getDataTypeSize(DataType dataType) { int dataTypeSize = 0; switch (dataType) { case TYPE_FLOAT: dataTypeSize = 4; break; case TYPE_DOUBLE: case TYPE_COMPLEX_FLOAT: dataTypeSize = 8; break; case TYPE_COMPLEX_DOUBLE: dataTypeSize = 16; break; } return dataTypeSize; } /* * Checks current dimensionality on a validity */ bool VISIBILITY_HIDDEN isSubDimValid(SubDimInfo* sd) { int j; size_t wgX = sd->pgran.wgSize[0]; size_t wgY = sd->pgran.wgSize[1]; SubproblemDim l0 = sd->sdim[0]; SubproblemDim l1 = sd->sdim[1]; size_t dataTypeSize = getDataTypeSize(sd->dtype); size_t dataFloatSize = getDataTypeSize(TYPE_FLOAT); int maxRegistr = 64; bool ret = true; bool inv; IgnoreItem* ii = sd->first; // if pattern-based validation is available if( NULL != sd->pattern->sops->checkCalcDecomp ){ return sd->pattern->sops->checkCalcDecomp( &sd->pgran, sd->sdim, 2, sd->dtype, PGRAN_CHECK ); } ret = ret && (l1.y >= 4*dataFloatSize/dataTypeSize); if (sd->blasLevel == 3) { if (!isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_A) || !isMatrixAccessColMaj(sd->func, sd->flag, MATRIX_B)) { /* Avoid small bwidth and big x0, y0 for cases other than * column major access to both matrixes */ ret = ret && (l1.bwidth >= 4*dataFloatSize/dataTypeSize); ret = ret && (l0.y < 128); ret = ret && (l0.x < 128); } } if ( 0 == l1.bwidth ){ return false; } else{ ret = ret && ((l0.bwidth % l1.bwidth) == 0); ret = ret && (wgX*wgY == 64); } //ret = ret && (wgX*wgY < sd->workGroupSizes); //ret = ret && (wgX*wgY > 16); if (sd->blasLevel == 2) { ret = ret && (l0.y > l1.y); } else { ret = ret && (l0.x > l1.x); ret = ret && (l0.y > l1.y); ret = ret && (l1.x >= 4*dataFloatSize/dataTypeSize); } if (sd->is2D) { bool r = ret; ret = ret && (wgY * l1.itemX == l0.x); ret = ret && (wgX * l1.itemY == l0.y); if (r != ret) { return ret; } } if (ret && sd->isSquareBlock) { ret = ret && (l0.x == l0.y && l0.x == l0.bwidth); } //if (!(isLdsUsed(sd->pattern) || 
(sd->isSquareBlock && sd->nrLevel == 2))) { // ret = ret && l0.bwidth == l1.bwidth; //} if (ret) { int r ; r = (int)(l1.x*l1.bwidth + l1.y*l1.bwidth + l1.x*l1.y); r = r * (int)dataTypeSize / sizeof(cl_float4); if (r > maxRegistr) { return false; } } if (ret && sd->pattern->sops->isFitToLDS != NULL) { bool isFitToLDS; CLBlasKargs args; convKExtraFlagToArg(sd->flag, &args); isFitToLDS = sd->pattern->sops->isFitToLDS(sd->sdim, sd->dtype, sd->ldsSize, &args); if (!isFitToLDS) return false; } // Skip ignored dimension for (;ii != NULL; ii = ii->next) { inv = true; for(j = 0; j < V_COUNT; ++j) { int v1 = ii->var[j]; int v2 = get(&sd->var[j]); if (v1 == -1) { continue; } if (v1 == v2) { continue; } inv = false; break; } if (inv) { ret = false; } } return ret; } /* * Set invalid SubDimension. * Invalid SubDimensions will be skipped. */ void VISIBILITY_HIDDEN setInvalid(SubDimInfo* sdi, int l0x, int l0y, int l0w, int l1x, int l1y, int l1w) { IgnoreItem* ii = malloc(sizeof(IgnoreItem)); ii->var[V_L0_X] = l0x; ii->var[V_L0_Y] = l0y; ii->var[V_L0_BW] = l0w; ii->var[V_L1_X] = l1x; ii->var[V_L1_Y] = l1y; ii->var[V_L1_BW] = l1w; ii->next = sdi->first; sdi->first = ii; } void VISIBILITY_HIDDEN initVector(SubDimInfo* sd) { //0 1 2 3 4 5 6 7 8 9 10 11 int dim [] = {1,2,4,8,16,32,64,128,256,512,1024,2048, 4096}; if (sd->blasLevel == 2 ) { setVariable(sd, V_L0_X, 1, &dim[0]); setVariable(sd, V_L0_Y, 6, &dim[4]); setVariable(sd, V_L0_BW, 10, &dim[0]); setVariable(sd, V_L1_X, 1, &dim[0]); setVariable(sd, V_L1_Y, 6, &dim[1]); setVariable(sd, V_L1_BW, 6, &dim[0]); } else { setVariable(sd, V_L0_X, 4, &dim[4]); setVariable(sd, V_L0_Y, 4, &dim[4]); setVariable(sd, V_L0_BW, 6, &dim[0]); setVariable(sd, V_L1_X, 6, &dim[0]); setVariable(sd, V_L1_Y, 6, &dim[0]); setVariable(sd, V_L1_BW, 6, &dim[0]); } } void VISIBILITY_HIDDEN initKNMVector( SubDimInfo* sd, unsigned int baseDim, unsigned int* K, unsigned int* N, unsigned int* M ) { if (sd->blasLevel == 2 ) { *K = 1; *N = baseDim * 2; *M = baseDim * 2; } else { *K = baseDim; *N = baseDim; *M = baseDim; } } int VISIBILITY_HIDDEN get(SubDimItem* sd) { return sd->data[sd->curId]; } void VISIBILITY_HIDDEN calcPGranularity (SubDimInfo* sd) { SubproblemDim* dim = sd->sdim; PGranularity* pgran = &sd->pgran; //int level = sd->cuLevel; pgran->wgDim = 2; pgran->wfSize = 64; pgran->maxWorkGroupSize = sd->workGroupSizes; // if pattern provides granularity calculation // call the pattern function if( NULL != sd->pattern->sops->checkCalcDecomp ){ sd->pattern->sops->checkCalcDecomp( pgran, dim, 2, sd->dtype, PGRAN_CALC ); } else{ pgran->wgSize[1] = (unsigned int)(dim[0].x / dim[1].itemX); pgran->wgSize[0] = (unsigned int)(dim[0].y / dim[1].itemY); if (!sd->is2D) { pgran->wgDim = 1; pgran->wgSize[0] *= pgran->wgSize[1]; pgran->wgSize[1] = 1; } } } void VISIBILITY_HIDDEN calcParam(SubDimInfo* sd) { SubproblemDim* dim = sd->sdim; int dataTypeSize = getDataTypeSize(sd->dtype); memset(dim, 0, sizeof(sd->sdim)); dim[0].x = get(&sd->var[V_L0_X]); dim[0].itemX = get(&sd->var[V_L0_X]); dim[0].y = get(&sd->var[V_L0_Y]); dim[0].itemY = get(&sd->var[V_L0_Y]); dim[0].bwidth = get(&sd->var[V_L0_BW]); dim[1].x = get(&sd->var[V_L1_X]); dim[1].itemX = get(&sd->var[V_L1_X]); dim[1].y = get(&sd->var[V_L1_Y]); dim[1].itemY = get(&sd->var[V_L1_Y]); dim[1].bwidth = get(&sd->var[V_L1_BW]) / (dataTypeSize / getDataTypeSize(TYPE_FLOAT)); if (funcHasTriangMatrix((BlasFunctionID)sd->func) && !sd->is2D) { dim[0].itemY = SUBDIM_UNUSED; } if (sd->blasLevel == 2) { size_t xBlocks; xBlocks = dim[0].x / dim[1].x; 
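        /* BLAS level-2 kernels decompose only along y: collapse the x
           extents to a single column and, unless the pattern supplies its
           own decomposition check, fold the number of x blocks into the
           level-0 block width. */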
dim[0].x = 1; dim[1].itemX = 1; dim[1].x = 1; if( NULL == sd->pattern->sops->checkCalcDecomp ){ dim[0].bwidth = dim[1].bwidth * xBlocks; } } calcPGranularity(sd); } bool VISIBILITY_HIDDEN next(SubDimItem var[V_COUNT]) { int i = V_COUNT - 1; bool next; do { next = false; var[i].curId ++; if (var[i].curId >= var[i].maxId) { var[i].curId = 0; next = true; -- i; } } while (next && i >= 0 ); return (next && i < 0); } void VISIBILITY_HIDDEN findValidSubdimInit(SubDimInfo* sd) { bool n = false; do { n = false; calcParam(sd); sd->valid = sd->isValid(sd); if (!sd->valid) { n = !next(sd->var); sd->valid = false; } } while (n); } bool nextSubdimElem(SubDimInfo* sd) { bool n = false; // !!! DEBUG if (sd->count > 500) { abort(); } sd->count ++; if (sd->valid == false) { return false; } if (sd->init != NULL) { sd->valid = false; n = !next(sd->var); if (n) findValidSubdimInit(sd); } return sd->valid; } /* * The variant included of the group. */ bool isMemberOfGroup(GroupStatInfo* gsi, Variant* vi) { bool res = true; res &= gsi->var[V_L0_X] == -1 || vi->var[V_L0_X] == gsi->var[V_L0_X]; res &= gsi->var[V_L0_Y] == -1 || vi->var[V_L0_Y] == gsi->var[V_L0_Y]; res &= gsi->var[V_L0_BW] == -1 || vi->var[V_L0_BW] == gsi->var[V_L0_BW]; res &= gsi->var[V_L1_X] == -1 || vi->var[V_L1_X] == gsi->var[V_L1_X]; res &= gsi->var[V_L1_Y] == -1 || vi->var[V_L1_Y] == gsi->var[V_L1_Y]; res &= gsi->var[V_L1_BW] == -1 || vi->var[V_L1_BW] == gsi->var[V_L1_BW]; return res; } /* * Calculate the minimum expected run time. */ double calcMinExpectedTimeForGroup(GroupStatInfo* gsi) { /* * K_INCREASE - Expected range of time values in the group * K_GLOBAL - */ const double K_INCREASE = 1.5; const double K_GLOBAL = 0.97; /* Number of variants in group */ double m = gsi->allCount; /* Number of variants in group for whom time is measured*/ double i = gsi->count; /* * k - Reflects the expected spread of values in the group, * depending on the number of measurements * decreases with increasing i * if i == 1 then k K_INCREASE * if i == m then k = 1 */ double ki = 1/ ((K_INCREASE + K_INCREASE/(m+i) -1)/(i) + (m-K_INCREASE)/(m+1)); double averageTime = (gsi->allTime / m); /* * kdelta - Reflects the expected spread of values in the group, * depending on the spread of values of the measured variations */ double kdelta = (gsi->minTime*3)/((gsi->minTime*2) + averageTime); double t = K_GLOBAL * kdelta * ki * gsi->minTime; /* * Select the minimum time between the minimum time for the current group * and the minimum time for the previous groups */ return t; } bool nextSubdim(SubDimInfo* sd, int maxParam, double time) { int i; int j; double minW = -5000; int vari = 0; double midTime; int iCount = 0; double maxTime; const int MAX_WEIGHT = 99; Variant* v0 = sd->curVar; // Current variant Variant* varNext = NULL; // Next Variant if (sd->count >= maxParam) { return false; } if (sd->returnAll) { bool ret = nextSubdimElem (sd); calcParam(sd); sd->curVarID = sd->count; return ret; } v0->time = time; sd->sumTime += time; midTime = sd->sumTime/(sd->count + 1); if (time > 0) { sd->minTime = fmin(sd->minTime, (float)time); } maxTime = fmax(2.1*midTime - sd->minTime, sd->minTime*5); /* Initialize all groups */ for (j = 0; j < sd->infoCount; j++ ) { GroupStatInfo* si = &sd->info[j]; si->allTime = 0; si->count = 0; si->minTime = 1e9; } /* Calculate an estimate for the groups */ for (i = 0; i < sd->varCount; ++i) { Variant* vi = &sd->allVariant[i]; /* If time for variant is measured*/ if (vi->time > 0) { for (j = 0; j < sd->infoCount; j++ ) { GroupStatInfo* gsi = 
&sd->info[j]; // For each group, if variant is member this group if (isMemberOfGroup(gsi, vi)) { gsi->minTime = fmin(gsi->minTime, vi->time); gsi->allTime += fmin(vi->time, maxTime); gsi->count ++; gsi->minTime = calcMinExpectedTimeForGroup(gsi); } } } vi->minTime = 0; vi->maxTime = 5000; vi->weight = MAX_WEIGHT; } /* * Calculate the estimate run-time variant */ for (i = 0; i < sd->varCount; ++i) { Variant* vi = &sd->allVariant[i]; vi->weight = MAX_WEIGHT; if (vi->time == 0) { double kgroup = 1.0; for (j = 0; j < sd->infoCount; j++ ) { GroupStatInfo* gsi = &sd->info[j]; // if the variant included of the group if (isMemberOfGroup(gsi, vi)) { if (gsi->count > 0) { vi->minTime = fmax(vi->minTime, gsi->minTime); vi->weight = sd->minTime/vi->minTime; } else { /* * If variant don't included of the group * then to reduce estimated time */ kgroup *= 1.1; } } } vi->weight *= kgroup; vi->minTime /= kgroup; } } /* Find variant with minimal run time */ for (i = 0; i < sd->varCount; ++i) { Variant* vi = &sd->allVariant[i]; if (vi->time == 0 && vi->weight >= 0.01 ) { iCount ++; if (minW < vi->weight) { minW = vi->weight; varNext = vi; vari = i; } } } // if (varNext == NULL) { return false; } sd->curVar = varNext; sd->curVarID = vari; #ifdef TEST_LOG printf ("%4d %6.2f [%6.2f:%5.2f ]",iCount, sd->minTime, sd->curVar->minTime, sd->curVar->weight); #endif for(j = 0; j < V_COUNT; ++j) { sd->var[j].curId = varNext->var[j]; } calcParam(sd); sd->count++; return true; } void resetSubdim(SubDimInfo* sd) { int i; for (i=0; i< V_COUNT; ++i) { sd->var[i].curId = 0; } sd->count = 0; sd->valid = false; if (sd->init != NULL) { sd->init(sd); findValidSubdimInit(sd); assert(sd->valid); } } /* * Groups variants in nonzero parameters. * * Example: l0x = 1 and remaining parameters = 0; * At different variants the parameter l0x accepts values 16, 32, 64. * At the first stage creates are 3 groups (a set of groups). * At the second stage all variants are arranged on these groups. * * The each variant included one group of the set of group. * The each variant included in each set of group. * In set of group can be only one group */ void setGroup(SubDimInfo* sd, int l0x, int l0y, int l0w, int l1x, int l1y, int l1w, int pg) { int i, j; int start = sd->infoCount; int end = sd->infoCount; (void) pg; //For each variant for (i = 0; i < sd->varCount; ++i) { Variant* vi = &sd->allVariant[i]; int id = -1; // For each group of the set of group for (j = start; j < end; j++ ) { bool bj = true; bj &= l0x == 0 || vi->var[V_L0_X] == sd->info[j].var[V_L0_X]; bj &= l0y == 0 || vi->var[V_L0_Y] == sd->info[j].var[V_L0_Y]; bj &= l0w == 0 || vi->var[V_L0_BW] == sd->info[j].var[V_L0_BW]; bj &= l1x == 0 || vi->var[V_L1_X] == sd->info[j].var[V_L1_X]; bj &= l1y == 0 || vi->var[V_L1_Y] == sd->info[j].var[V_L1_Y]; bj &= l1w == 0 || vi->var[V_L1_BW] == sd->info[j].var[V_L1_BW]; // if the variant belongs to group if (bj) { id = j; break; } } /* * if the variant doesn't belong to any group create new group */ if (id == -1) { sd->info[end].var[V_L0_X] = (l0x == 1)? vi->var[V_L0_X] : -1; sd->info[end].var[V_L0_Y] = (l0y == 1)? vi->var[V_L0_Y] : -1; sd->info[end].var[V_L0_BW] = (l0w == 1)? vi->var[V_L0_BW] : -1; sd->info[end].var[V_L1_X] = (l1x == 1)? vi->var[V_L1_X] : -1; sd->info[end].var[V_L1_Y] = (l1y == 1)? vi->var[V_L1_Y] : -1; sd->info[end].var[V_L1_BW] = (l1w == 1)? 
vi->var[V_L1_BW] : -1; sd->info[end].pg = 0; sd->info[end].allTime = 0; sd->info[end].allCount = 1; end++; sd->infoCount++; } else { sd->info[id].allCount++; } } } void initSubDimInfo(SubDimInfo* sd, MemoryPattern* mempatt, DeviceInfo* devinfo, unsigned int func, unsigned int patt, DataType dtype, KernelExtraFlags flag) { int i = 0; memset(sd, 0, sizeof(SubDimInfo)); sd->func = func; sd->patt = patt; sd->dtype = dtype; sd->flag = flag; sd->pattern = mempatt; sd->first = NULL; sd->is2D = (sd->pattern->sops->getFlags() & SF_WSPACE_2D)?true:false; sd->isSquareBlock = ((sd->pattern->sops->getFlags() & SF_TOP_INPUT_SQUARE_BLOCKS) != 0); sd->blasLevel = funcBlasLevel(sd->func); sd->nrLevel = sd->pattern->nrLevels; sd->ldsSize = devinfo->ldsSize; sd->workGroupSizes = devinfo->workGroupSizes; // Virtual function sd->isValid = isSubDimValid; sd->init = initVector; resetSubdim(sd); i = 0; do { i++; } while (nextSubdimElem(sd)); sd->allVariant = malloc(i* sizeof(Variant)); resetSubdim(sd); sd->varCount = i; for (i = 0; i < sd->varCount; ++i) { int j; int gpx; int gpy; for(j = 0; j < V_COUNT; ++j) { sd->allVariant[i].var[j] = sd->var[j].curId; } sd->allVariant[i].minTime = 0.0; sd->allVariant[i].probableTime = 0.0; sd->allVariant[i].maxTime = 5000.0; sd->allVariant[i].weight = 10; sd->allVariant[i].time = 0; gpx = get(&sd->var[V_L0_X])/ get(&sd->var[V_L1_X]); gpy = get(&sd->var[V_L0_Y])/ get(&sd->var[V_L1_Y]); sd->allVariant[i].pg = gpx * 1000 + gpy; nextSubdimElem(sd); } resetSubdim(sd); sd->minTime = 9999; sd->curVar = &sd->allVariant[0]; sd->curVarID = 0; // Initializing group sd->infoMaxCount = 5000; sd->infoCount = 0; sd->info = malloc(sd->infoMaxCount * sizeof(GroupStatInfo) ); // L0 L1 PG // x y w x y w setGroup(sd, 1, 1, 0, 0, 0, 0, 0); setGroup(sd, 1, 1, 1, 0, 0, 0, 0); setGroup(sd, 0, 0, 0, 1, 1, 1, 0); setGroup(sd, 1, 1, 0, 1, 1, 0, 0); } void setVariable(struct SubDimInfo* sdi, SubDimVariable var, int dcount, int* dim) { size_t size = dcount*sizeof(int); sdi->var[var].curId = 0; sdi->var[var].maxId = dcount; if (sdi->var[var].data != NULL) { free (sdi->var[var].data); sdi->var[var].data = NULL; } sdi->var[var].data = malloc(size); memcpy(sdi->var[var].data, dim, size); } clblas-2.10/src/library/tools/tune/subdim.h000066400000000000000000000061751264277366700207330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SUBDIM_H__ #define SUBDIM_H__ //#define TEST_LOG typedef struct SubDimItem { int curId; int maxId; int* data; }SubDimItem; int get(SubDimItem * sdi); /////////////////////////////////////////////////////////////////////////////// enum { V_NONE = -1, }; typedef enum SubDimVariable { V_L0_X, V_L0_Y, V_L0_BW, V_L1_X, V_L1_Y, V_L1_BW, V_COUNT, }SubDimVariable; typedef struct IgnoreItem { int var[V_COUNT]; struct IgnoreItem* next; }IgnoreItem; typedef struct GroupStatInfo { int var[V_COUNT]; int pg; double minTime; double allTime; int count; int allCount; }GroupStatInfo; typedef struct Variant { // int var[V_COUNT]; int pg; // Estimated time performance double minTime; // lower bound double probableTime; // double maxTime; // upper bound double weight; double time; }Variant; /////////////////////////////////////////////////////////////////////////////// typedef struct SubDimInfo { // dynamic array for statistics GroupStatInfo * info; int infoCount; int infoMaxCount; Variant* allVariant; SubDimItem var[V_COUNT]; PGranularity pgran; SubproblemDim sdim[MAX_SUBDIMS]; MemoryPattern * pattern; bool valid; DataType dtype; KernelExtraFlags flag; unsigned int func; unsigned int patt; bool is2D; int blasLevel; int nrLevel; bool isSquareBlock; unsigned long ldsSize; size_t workGroupSizes; // IgnoreItem * first; int count; double sumTime; Variant* curVar; int curVarID; int varCount; float minTime; void (*init)(struct SubDimInfo* sdi); bool (*isValid)(struct SubDimInfo* sdi); //#ifdef TEST_LOG bool returnAll; //#endif }SubDimInfo; void setVariable(struct SubDimInfo* sdi, SubDimVariable var, int dcount, int* dim); void setInvalid (struct SubDimInfo* sdi, int l0x, int l0y, int l0w, int l1x, int l1y, int l1w); bool nextSubdim(SubDimInfo* sd, int maxParam, double time); void resetSubdim(SubDimInfo* sd); void initSubDimInfo(SubDimInfo* sd, MemoryPattern* mempatt, DeviceInfo* devinfo, unsigned int func, unsigned int patt, DataType dtype, KernelExtraFlags flag); void destroySubdim(SubDimInfo* sd); void convKExtraFlagToArg(KernelExtraFlags flags, CLBlasKargs* args); #endif /* SUBDIM_H__ */ clblas-2.10/src/library/tools/tune/toolslib.c000066400000000000000000000360361264277366700212710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include "storage_data.h" #include "toolslib.h" #include "devinfo.h" #include "assert.h" #include "clblas_stddef.h" #include "mutex.h" // The array size is the total number devices on all platforms static StorageCacheImpl* storageCacheArray = NULL; // Number of items in storage cache array // is the number of unique devices. 
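// Note (added for clarity): devices are distinguished by their DeviceIdent (vendor/family/chip), so several identical GPUs on the same system share a single cache entry; see isDeviceEQ() and initStorageCache() below.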
static unsigned int storageCacheArrayCount = 0; static mutex_t *storageCacheLock = NULL; static void clearPatternsNumber(BlasFunctionInfo *funcInfo) { int i; for (i = 0; i < BLAS_FUNCTIONS_NUMBER; i++) { funcInfo[i].numPatterns = 0; } } char* getDevName(TargetDevice* tdev) { size_t size; char* name; clGetDeviceInfo(tdev->id, CL_DEVICE_NAME, 0, NULL, &size); name = malloc(size * sizeof(char)); clGetDeviceInfo(tdev->id, CL_DEVICE_NAME, size, name, NULL); return name; } void initCLDeviceInfoRec(TargetDevice* tdev, DeviceInfo *devInfo) { cl_int status = 0; cl_uint bDouble; cl_device_id devID = tdev->id; devInfo->tdev = tdev; status = clGetDeviceInfo(devID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &(devInfo->nrComputeUnits), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(cl_ulong), &(devInfo->globalSize), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &(devInfo->ldsSize), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &(devInfo->maxMemAllocSize), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, sizeof(cl_uint), &(devInfo->alignment), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &(devInfo->workItemSizesDim), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * devInfo->workItemSizesDim, &(devInfo->workItemSizes), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t) , &(devInfo->workGroupSizes), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_ADDRESS_BITS, sizeof(cl_uint), &(devInfo->addressBits), NULL); status = clGetDeviceInfo(devID, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &bDouble, NULL); devInfo->nativeDouble = deviceHasNativeDouble(devID, &status); // Values are put randomly. 
//TODO To use the correct data devInfo->nrStreamCores = 1; /* Number of stream cores per Compute Unit */ devInfo->wavefront = 64; /* Number of work-items executed in parallel on hardware */ devInfo->nativeComplex = true; /* Specifies whether device supports complex float */ } bool initReadingData(StorageCacheImpl* cacheImpl, TargetDevice* tdev ) { char* devName; DeviceInfo defInf; initCLDeviceInfoRec(tdev, &defInf); initBlasFuncionData(cacheImpl->functionInfo); initCacheData(cacheImpl->functionInfo, &defInf); cacheImpl->endFile = calcOffset(cacheImpl->functionInfo); devName = getDevName(tdev); cacheImpl->fpath = createFullPatch(devName, false); cacheImpl->fpath_tmp = createFullPatch(devName, true); free(devName); if (cacheImpl->fpath == NULL) { return false; } return true; } int getGranularityInfo( // In TargetDevice* tdev, const char* pattName, const DataType dt, const KernelExtraFlags kflag, int dim, // // Out SubproblemDim *sdim, PGranularity *pgran, double *time) { BlasParamInfo* bParam; int ret = GF_ERROR; int r; StorageCacheImpl* cache = getStorageCache(tdev, false); if (cache == NULL) { return ret; } bParam = findParam(cache, pattName, dt, kflag, dim); if (bParam != NULL) { r = bParam->sstatus != SS_CORRECT_DATA; if (!r) { memcpy(sdim, bParam->sDim, sizeof(SubproblemDim)* MAX_SUBDIMS); memcpy(pgran,&bParam->pGran, sizeof(PGranularity)); *time = bParam->time; ret = GF_SUCCESS; } else if (r == -1) { ret = GF_CORRUPT_FILE; //printCorruptionError(devID); } } return ret; } int getKernelInfo( TargetDevice* devID, const char* pattName, const DataType dt, const KernelExtraFlags kflag, int dim, unsigned char** buffer, size_t* sizeBuffer) { BlasParamInfo* bParam; int ret = GF_ERROR; StorageCacheImpl* cache = getStorageCache(devID, false); if (cache == NULL) { return ret; } memset(buffer, 0, sizeof(char*) * MAX_CLBLAS_KERNELS_PER_STEP); memset(sizeBuffer, 0, sizeof(size_t) * MAX_CLBLAS_KERNELS_PER_STEP); if (cache->isPopulate) { bParam = findParam(cache, pattName, dt, kflag, dim); if (bParam != NULL) { loadKernelsFromFile(cache, bParam, buffer, sizeBuffer); if (buffer[0] == NULL) { ret = GF_SUCCESS; } } } return ret; } /******************************************************************************/ void destroyStorageCache(void) { unsigned int i; StorageCacheImpl* curCache; if(storageCacheArray != NULL) { for (i = 0; i < storageCacheArrayCount; i++) { curCache = &storageCacheArray[i]; if (curCache != NULL) { destroyData(curCache->functionInfo); if (curCache->fpath != NULL) { free(curCache->fpath); } if (curCache->fpath_tmp != NULL) { free(curCache->fpath_tmp); } curCache->isPopulate = false; } } storageCacheArrayCount = 0; mutexDestroy(storageCacheLock); storageCacheLock = NULL; free(storageCacheArray); storageCacheArray = NULL; } } BlasFunctionInfo* getBlasFunctionInfo(TargetDevice* tdev, int func) { StorageCacheImpl* impl = getStorageCache(tdev, false); BlasFunctionInfo* ret = NULL; if (impl == NULL) { return NULL; } if (func >= 0 && func < BLAS_FUNCTIONS_NUMBER) { ret = &impl->functionInfo[func]; } return ret; } #define CHECK_(X) \ res = X; \ if (!res) { \ printf("ERROR %s\n", #X); \ /*raise(SIGTRAP);*/ \ } void checkFILE(TargetDevice* tdev, BlasFunctionInfo* fiArr) { StorageCacheImpl* impl; bool res; int func; unsigned int patt; unsigned int extra; unsigned int param; impl = getStorageCache(tdev, false); if (impl == NULL) { return; } for (func = 0; func < BLAS_FUNCTIONS_NUMBER; func++) { BlasFunctionInfo* cfi = &impl->functionInfo[func]; BlasFunctionInfo* fi = &fiArr[func]; 
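/* Field-by-field consistency check (comment added for clarity): compare the cached record (cfi) against the freshly initialized reference (fi); the CHECK_ macro above prints an error line for every mismatching field. */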
CHECK_(cfi->funcNo == fi->funcNo); CHECK_(cfi->numPatterns == fi->numPatterns); CHECK_(cfi->maskForTuningsKernel == fi->maskForTuningsKernel); CHECK_(cfi->maskForUniqueKernels == fi->maskForUniqueKernels); CHECK_(cfi->defaultPattern == fi->defaultPattern); CHECK_(cfi->defaultPattern == fi->defaultPattern); CHECK_(strcmp(cfi->name, fi->name) == 0); //CHECK_(cfi-> == fi->) for (patt = 0; patt < fi->numPatterns; ++patt) { BlasPatternInfo* cpi = &cfi->pattInfo[patt]; BlasPatternInfo* pi = &fi->pattInfo[patt]; MemoryPattern* cmp = &cfi->pattern[patt]; MemoryPattern* mp = &fi->pattern[patt]; CHECK_(cpi->numExtra == pi->numExtra ); CHECK_(cpi->numTuneExtra == pi->numTuneExtra); CHECK_(cpi->offset == pi->offset); CHECK_(cpi->size == pi->size); //CHECK_(cpi->sstatus == pi->sstatus); CHECK_(cpi->pattNo == pi->pattNo); CHECK_(strcmp(cpi->name, pi->name) == 0); CHECK_(cmp->nrLevels == mp->nrLevels ); CHECK_(cmp->cuLevel == mp->cuLevel ); CHECK_(cmp->thLevel == mp->thLevel ); CHECK_(cmp->sops == mp->sops ); CHECK_(cmp->extra == mp->extra ); CHECK_(strcmp(cmp->name, mp->name) == 0); for (extra = 0; extra < pi->numExtra; ++extra) { BlasExtraInfo* cei = &cpi->extra[extra]; BlasExtraInfo* ei = &pi->extra[extra]; CHECK_(cei->numParam == ei->numParam); CHECK_(cei->dtype == ei->dtype); CHECK_(cei->flags == ei->flags); CHECK_(cei->vecLen == ei->vecLen); CHECK_(cei->isUseForTunning == ei->isUseForTunning); CHECK_(cei->offset == ei->offset); CHECK_(cei->size == ei->size); CHECK_(cei->sstatus == ei->sstatus); for (param = 0; param < ei->numParam; ++param) { BlasParamInfo* cpri = &cei->param[param]; BlasParamInfo* pri = &ei->param[param]; CHECK_(cpri->dim == pri->dim); CHECK_(cpri->pGran.wfSize == pri->pGran.wfSize); CHECK_(cpri->pGran.wgDim == pri->pGran.wgDim); CHECK_(cpri->pGran.wgSize[0] == pri->pGran.wgSize[0]); CHECK_(cpri->pGran.wgSize[1] == pri->pGran.wgSize[1]); CHECK_(cpri->sDim[0].bwidth == pri->sDim[0].bwidth); CHECK_(cpri->sDim[0].itemX== pri->sDim[0].itemX); CHECK_(cpri->sDim[0].itemY== pri->sDim[0].itemY); CHECK_(cpri->sDim[0].x == pri->sDim[0].x); CHECK_(cpri->sDim[0].y == pri->sDim[0].y); CHECK_(cpri->sDim[1].bwidth == pri->sDim[1].bwidth); CHECK_(cpri->sDim[1].itemX== pri->sDim[1].itemX); CHECK_(cpri->sDim[1].itemY== pri->sDim[1].itemY); CHECK_(cpri->sDim[1].x == pri->sDim[1].x); CHECK_(cpri->sDim[1].y == pri->sDim[1].y); CHECK_(cpri->sDim[2].bwidth == pri->sDim[2].bwidth); CHECK_(cpri->sDim[2].itemX== pri->sDim[2].itemX); CHECK_(cpri->sDim[2].itemY== pri->sDim[2].itemY); CHECK_(cpri->sDim[2].x == pri->sDim[2].x); CHECK_(cpri->sDim[2].y == pri->sDim[2].y); CHECK_(cpri->time == pri->time); CHECK_(cpri->offset == pri->offset); CHECK_(cpri->size == pri->size); CHECK_(cpri->sstatus == pri->sstatus); } } } } } bool isDeviceEQ(DeviceIdent* dev1, DeviceIdent* dev2) { bool ret = true; ret &= dev1->chip == dev2->chip; ret &= dev1->family == dev2->family; ret &= dev1->vendor == dev2->vendor; return ret; } StorageCacheImpl* getStorageCache(TargetDevice* tdev, bool force) { unsigned int k; StorageCacheImpl* curCache = NULL; assert(storageCacheArray != NULL); assert(storageCacheLock != NULL); for (k = 0; k < storageCacheArrayCount; ++k) { if (isDeviceEQ(&tdev->ident, &storageCacheArray[k].devIdent) ) { curCache = &storageCacheArray[k]; } } assert (curCache != NULL); // Read data from file can be only one thread // Work with the cached data can all threads in parallel if (!curCache->isInit) { mutexLock(storageCacheLock); // LOCK if (!curCache->isInit) { curCache->isPopulate = false; if 
(initReadingData(curCache, tdev)) { loadDataFromFile(curCache); } curCache->isInit = true; } mutexUnlock(storageCacheLock); // UNLOCK } // if storage cashe is empty then return NULL if (!(curCache->isPopulate || force)) { curCache = NULL; } return curCache; } unsigned int getPlatforms(cl_platform_id **platforms) { cl_int ret; cl_uint numberPlatform; ret = clGetPlatformIDs(0, NULL, &numberPlatform); if (ret != CL_SUCCESS || numberPlatform == 0) { return 0; } *platforms = calloc(numberPlatform, sizeof(cl_platform_id)); if (*platforms == NULL) { return 0; } ret = clGetPlatformIDs(numberPlatform, *platforms, NULL); return numberPlatform; } void initStorageCache(void) { cl_uint numberPlatform = 0; cl_platform_id *platforms = NULL; cl_device_id *devices = NULL; StorageCacheImpl* cur = NULL; cl_int ret; unsigned int deviceCount = 0; unsigned int i, j, k; assert (storageCacheLock == NULL); assert (storageCacheArray == NULL); assert (storageCacheArrayCount == 0); storageCacheLock = mutexInit(); numberPlatform = getPlatforms(&platforms); if (numberPlatform ==0) { return; } for (i =0; i < numberPlatform; ++i) { cl_uint dc; ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dc); if (ret == CL_SUCCESS) { deviceCount += dc; } } storageCacheArray = calloc(deviceCount, sizeof(*storageCacheArray)); for (i =0; i < numberPlatform; ++i) { cl_uint dc; ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &dc); if (ret != CL_SUCCESS) { continue; } devices = calloc(dc, sizeof(*devices)); ret = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, dc, devices, NULL); for (j = 0; j < dc; ++ j) { TargetDevice td; bool isUnique = true; td.id = devices[j]; identifyDevice(&td); for (k = 0; k < storageCacheArrayCount; ++k) { if (isDeviceEQ(&td.ident, &storageCacheArray[k].devIdent) ) { isUnique = false; } } if (isUnique) { cur = &storageCacheArray[storageCacheArrayCount]; clearPatternsNumber(cur->functionInfo); cur->isInit = false; cur->devIdent.chip = td.ident.chip; cur->devIdent.family = td.ident.family; cur->devIdent.vendor = td.ident.vendor; storageCacheArrayCount++; } } free(devices); } free (platforms); } clblas-2.10/src/library/tools/tune/toolslib.h000066400000000000000000000040011264277366700212610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef TOOLSLIB_H__ #define TOOLSLIB_H__ #ifdef __APPLE__ #include #else #include #endif #include #include #include #include #include // Interface to access to saved data #define GF_SUCCESS 0 #define GF_ERROR 1 #define GF_INVALID_CACHE 2 #define GF_CORRUPT_FILE 3 #define GF_KERNEL_NOT_FOUND 4 /* * FIXME: It's a kludge to dedicated processing a case when matrix leading * dimension is aligned on the bank size */ #define BANK_ALIGNED_CASE_RECORD_IDX 5 typedef int dimension; void initStorageCache(void); void destroyStorageCache(void); int getGranularityInfo ( TargetDevice* tdev, const char* pattName, const DataType dt, const KernelExtraFlags kflag, dimension dim, SubproblemDim* sdim, PGranularity* pgran, double* time); int getKernelInfo ( TargetDevice* tdev, const char* pattName, const DataType dt, const KernelExtraFlags kflag, dimension dim, unsigned char** bufer, size_t* sizeBufer); int getDimensionCount(TargetDevice* tdev, int func); dimension getDimensionID ( TargetDevice* tdev, int func, size_t M, size_t N, size_t K); #endif /* TOOLSLIB_H__ */ clblas-2.10/src/library/tools/tune/tune.c000066400000000000000000002223741264277366700204170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include #ifdef __APPLE__ #include #else #include #endif // #include "fileio.h" #include "toolslib.h" #include "tune.h" #include "devinfo.h" #include "assert.h" #include "solution_seq.h" #include "matrix_dims.h" #include "subdim.h" #if defined(_MSC_VER) #include "Windows.h" #elif defined(__APPLE__) #include #include #include #else #include "time.h" #endif #define EXIT_COD_OK 0x0000 #define EXIT_COD_CL_ERROR 0x0100 #define EXIT_COD_UNKNOWN_DATATYPE 0x0101 #define EXIT_COD_NO_DATA 0x0102 #define EXIT_COD_NO_ENVIRONMENT_VARIABLE 0x0103 #define EXIT_COD_BAD_ENVIRONMENT_VARIABLE 0x0104 #define TYPE_NUMBER 4 #define MAX_RUN_KERNEL 3 typedef int KMASK; ////////////////////////////////////////////////////////////////// #if defined(_MSC_VER) typedef unsigned long long nano_time_t; #define NANOTIME_MAX (~0ULL - 1) #define fmin min #define fmax max nano_time_t conv2nanosec(nano_time_t t) { LARGE_INTEGER count; if (QueryPerformanceFrequency(&count) == FALSE) { return 0; } t = (t * 1000000)/count.QuadPart; return (nano_time_t)(t * 1000); } nano_time_t getCurrentTime(void) { LARGE_INTEGER count; if (QueryPerformanceCounter(&count) == FALSE) { return 0; } return (nano_time_t)count.QuadPart; } #elif defined(__APPLE__) typedef uint64_t nano_time_t; #define NANOTIME_MAX UINT64_MAX nano_time_t conv2nanosec(nano_time_t t) { static mach_timebase_info_data_t timebase_info = {0}; if (timebase_info.denom == 0) { (void)mach_timebase_info(&timebase_info); } /* Let's hope we don't overflow */ return (t * timebase_info.denom) / timebase_info.numer; } nano_time_t getCurrentTime(void) { return mach_absolute_time(); } #else typedef unsigned long nano_time_t; #define NANOTIME_MAX (~0UL - 1) nano_time_t conv2nanosec(nano_time_t t) { /* clock_... 
functions measure time in nanoseconds */ return t; } nano_time_t getCurrentTime(void) { int err; struct timespec t; err = clock_gettime(CLOCK_REALTIME, &t); if (err == 0) { return (t.tv_sec * 1000000000UL + t.tv_nsec); } return 0; } #endif /* defined(_MCS_VER) */ ////////////////////////////////////////////////////////////////// cl_int waitForSuccessfulFinish( cl_command_queue commandQueues, cl_event *event) { cl_int err, status; err = clFinish(commandQueues); if (err != CL_SUCCESS) { return err; } if (event == NULL || *event == NULL) { return CL_SUCCESS; } status = CL_COMPLETE; err = clGetEventInfo(*event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL); if (err != CL_SUCCESS) { return err; } if (status < 0) { return -status; } return CL_SUCCESS; } cl_int flushAll(cl_command_queue commandQueue) { cl_int err; err = clFlush(commandQueue); if (err != CL_SUCCESS) { return err; } return CL_SUCCESS; } enum { MASK_KERNEL_COMP = 0x01, MASK_KERNEL_A = 0x02, MASK_KERNEL_B = 0x04 }; const char *FILE_PATH = NULL; FILE* logStream; int globalDim = 0; enum { DEVNAME_MAXLEN = 64 }; #ifdef TEST_LOG #include typedef unsigned long nano_time_t; nano_time_t getCurrentTime(void) { int err; struct timespec t; err = clock_gettime(CLOCK_REALTIME, &t); if (err == 0) { return (t.tv_sec * 1000000000UL + t.tv_nsec); } return 0; } double globalTime = 0; double globalFastTime = 0; #endif extern int getDataTypeSize(DataType dataType); extern void writeStorageCache(TargetDevice* devID); extern BlasFunctionInfo* getBlasFunctionInfo(TargetDevice* devID, int func); extern void checkFILE(TargetDevice* devID, BlasFunctionInfo* fiArr); extern char* getDevName(TargetDevice* tdev); const unsigned int uiNONE = (unsigned int)-1; // float types based unified pointer typedef union FPtr { void *v; cl_float *f; cl_double *d; cl_float2 *f2; cl_double2 *d2; } FPtr; typedef struct GParam { int count; char name[65]; SubproblemDim dims[MAX_SUBDIMS]; PGranularity pgran; unsigned int vecLen; cl_ulong time; // For each kernel the binaries are created Kernel* kernel; //cl_kernel clkern; size_t binary_sizes; char* binaries; Kernel *kernelPrepA; Kernel *kernelPrepB; size_t binary_sizesA; char* binariesA; size_t binary_sizesB; char* binariesB; // POSFILE fbin[MAX_CLBLAS_KERNELS_PER_STEP]; } GParam; typedef struct MatrixInfo { DataType dtype; unsigned int sizeDType; unsigned int M; unsigned int N; unsigned int K; cl_mem clA; cl_mem clB; cl_mem clC; FPtr A; FPtr B; FPtr C; cl_mem clImgA; cl_mem clImgB; void *imgA; void *imgB; }MatrixInfo; enum Command { C_DEFAULT, C_REBUILD, C_GENKERNEL, C_ADD, }; struct GeneratorInfoRec { cl_platform_id platform; // ID of platform cl_device_type devType; cl_context ctx; cl_command_queue queue; // unsigned int numDevices; // number of Devices TargetDevice targetDevice; // DeviceInfo deviceInfos; // Todo delete this member. Use TargetDevice. 
char *deviceName; // bool aFunc[BLAS_FUNCTIONS_NUMBER]; // True/false value if the corresponding function should be tuned int aPattern; bool aDType[TYPE_NUMBER]; // True false value if the precision should be tuned; s/d/c/z int aFlag; int aCommand; bool aIsKernel; // True/false value to store binary kernels into the kernel database int aMaxparam; bool aExtendedOutput; bool aAll; double next; double last; const char* patternName; } genInfo; char * genParamStr(char* name, int w, size_t data) { char format[5]; sprintf(format,"%%%uu ", w); if (data != (size_t)-1) { char format[5]; sprintf(format,"%%%uu ", w); sprintf(name, format, (unsigned)data); } else { char format[5]; sprintf(format,"%%%us ", w); sprintf(name, format, "SU"); } return name + w + 1; } char * genParamsStr(SubproblemDim* dim, char* name, int w) { char* n = name; n = genParamStr(n, w, dim->x); n = genParamStr(n, w, dim->y); n = genParamStr(n, w, dim->bwidth); sprintf(n,":"); return n + 1; } void createGParamName(GParam* param) { char* n = param->name; if (param->dims[2].itemX > 0) { n = genParamsStr(¶m->dims[0], n, 3); n = genParamsStr(¶m->dims[1], n, 3); n = genParamsStr(¶m->dims[2], n, 2); } else { n = genParamsStr(¶m->dims[0], n, 3); n = genParamsStr(¶m->dims[1], n, 2); } sprintf(n,"%3dx%-2d", param->pgran.wgSize[0], param->pgran.wgSize[1]); } static int patternUseImages(MemoryPattern *pattern) { const CLBLASMpatExtra *extra = (const CLBLASMpatExtra*)pattern->extra; if (extra == NULL) { return 0; } if ((extra->mobjA == CLMEM_IMAGE) || (extra->mobjB == CLMEM_IMAGE) ) { return 1; } return 0; } void initGeneratorInfoRec(void) { int i; memset(&genInfo, 0, sizeof(struct GeneratorInfoRec)); genInfo.devType = CL_DEVICE_TYPE_GPU; genInfo.aCommand = C_DEFAULT; for (i=0; i < TYPE_NUMBER; ++i) { genInfo.aDType[i] = false; } genInfo.aFlag = -1; for (i=0; i < BLAS_FUNCTIONS_NUMBER; ++i) { genInfo.aFunc[i] = false; } genInfo.aPattern = -1; genInfo.aIsKernel = false; genInfo.aMaxparam = 5000; genInfo.aExtendedOutput = false; } void destroyKernels(GParam *param) { if (param->kernel != NULL) { putKernel(NULL, param->kernel); param->kernel = NULL; } if (param->kernelPrepA != NULL) { putKernel(NULL, param->kernelPrepA); param->kernelPrepA = NULL; } if (param->kernelPrepB != NULL) { putKernel(NULL, param->kernelPrepB); param->kernelPrepB = NULL; } } void destroyGenInfo(void) { free (genInfo.deviceName); genInfo.deviceName = NULL; clReleaseCommandQueue(genInfo.queue); clReleaseContext(genInfo.ctx); // destroyData(genInfo.functionInfo); } void checkErrorFunc(char* funcName, cl_int status) { if (status != CL_SUCCESS) { char * ret = "UNKNOWN"; switch (status) { case CL_OUT_OF_RESOURCES: ret = "CL_OUT_OF_RESOURCES"; // -5 break; case CL_BUILD_PROGRAM_FAILURE: // -11 ret = "CL_BUILD_PROGRAM_FAILURE"; break; case CL_INVALID_VALUE: // - 30 ret = "CL_INVALID_VALUE"; break; case CL_INVALID_KERNEL_ARGS: // - 52 ret = "CL_INVALID_KERNEL_ARGS"; break; case CL_INVALID_WORK_GROUP_SIZE: // - 54 ret = "CL_INVALID_WORK_GROUP_SIZE"; break; case CL_INVALID_WORK_ITEM_SIZE: // - 55 ret = "CL_INVALID_WORK_ITEM_SIZE"; break; case CL_INVALID_BUFFER_SIZE: // - 61 ret = "CL_INVALID_BUFFER_SIZE"; break; } fprintf(logStream, "%s() failed with %d(%s)\n", funcName, status, ret); fflush(logStream); destroyGenInfo(); exit(EXIT_COD_CL_ERROR); } } void initOpenCl(void) { cl_int status = 0; cl_uint numPlatforms; status = clGetPlatformIDs(0, NULL, &numPlatforms); checkErrorFunc("clGetPlatformIDs", status); if (numPlatforms > 0) { unsigned int i; cl_platform_id* platforms = 
(cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); checkErrorFunc("clGetPlatformIDs", status); for(i=0; i < numPlatforms; ++i) { char pbuff[100]; status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); checkErrorFunc("clGetPlatformInfo", status); genInfo.platform = platforms[i]; if(!strcmp(pbuff, "Advanced Micro Devices, Inc.")) { break; } } free(platforms); } // Init Device count status = clGetDeviceIDs(genInfo.platform, genInfo.devType, 0, 0, (cl_uint*)&genInfo.numDevices); checkErrorFunc("clGetDeviceIDs", status); } void initDevice(int dev) { cl_int status = 0; cl_uint num_devices; cl_device_id* deviceIDs = (cl_device_id *)calloc(genInfo.numDevices, sizeof(cl_device_id)); status = clGetDeviceIDs(genInfo.platform, genInfo.devType, genInfo.numDevices, deviceIDs, &num_devices); checkErrorFunc("clGetDeviceIDs", status); genInfo.targetDevice.id = deviceIDs[dev]; identifyDevice(&genInfo.targetDevice); genInfo.deviceName = getDevName(&genInfo.targetDevice); initCLDeviceInfoRec(&genInfo.targetDevice, &genInfo.deviceInfos); } void getContext(void) { cl_int status = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_device_id device = genInfo.targetDevice.id; props[1] = (cl_context_properties)genInfo.platform; genInfo.ctx = clCreateContext(props, 1, &device, NULL, NULL, &status); checkErrorFunc("clCreateContext", status); genInfo.queue = clCreateCommandQueue(genInfo.ctx, device, CL_QUEUE_PROFILING_ENABLE, &status); checkErrorFunc("clCreateCommandQueue",status); } int bitcount (unsigned int n) { int count = 1 ; while (n) { count *= 2; n &= (n - 1) ; } return count ; } bool genKernel(GParam *param, CLBLASKernExtra* extra, MemoryPattern *pattern) { cl_int status; SolverKgen genKernel; bool ret = false; cl_device_id device; char bopts[BUILD_OPTS_MAXLEN]; genKernel = pattern->sops->genKernel; device = genInfo.targetDevice.id; setupBuildOpts(bopts, device, pattern); param->kernel = makeKernel(device, genInfo.ctx, genKernel, NULL /*cl_program*/, param->dims, ¶m->pgran, extra, bopts, NULL); if (param->kernel != NULL) { status = clGetProgramInfo(param->kernel->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), ¶m->binary_sizes, NULL); checkErrorFunc("clGetProgramInfo", status); param->binaries = (char *)malloc(sizeof(char)*param->binary_sizes); status = clGetProgramInfo(param->kernel->program, CL_PROGRAM_BINARIES, sizeof(char *), ¶m->binaries, NULL); checkErrorFunc("clGetProgramInfo", status); ret = true; } return ret; } void convKExtraFlagToArg(KernelExtraFlags flags, CLBlasKargs* args) { args->order = (flags & KEXTRA_COLUMN_MAJOR)?clblasColumnMajor: clblasRowMajor; args->side = (flags & KEXTRA_SIDE_RIGHT)? clblasRight: clblasLeft; args->uplo = (flags & KEXTRA_UPPER_TRIANG)?clblasUpper: clblasLower; args->transA = (flags & KEXTRA_TRANS_A)? clblasTrans: clblasNoTrans; args->transB = (flags & KEXTRA_TRANS_B)? clblasTrans: clblasNoTrans; if (isComplexType(args->dtype)) { args->transA = (flags & KEXTRA_CONJUGATE_A)?clblasConjTrans: args->transA; args->transB = (flags & KEXTRA_CONJUGATE_B)?clblasConjTrans: args->transB; } args->diag = (flags & KEXTRA_UNIT_DIAGONAL)? clblasUnit: clblasNonUnit; } void initCLBlasKArgDim(CLBlasKargs *args, MatrixInfo* mi, KernelExtraFlags extra) { cl_int status; float beta = ((extra & KEXTRA_BETA_ZERO) != 0)? 
0.0f : 1.0f; memset( args, 0, sizeof(CLBlasKargs) ); convKExtraFlagToArg( extra, args ); args->dtype = mi->dtype; switch (mi->dtype) { case TYPE_FLOAT: args->alpha.argFloat = 1.0; args->beta.argFloat = beta; break; case TYPE_DOUBLE: args->alpha.argDouble = 1.0; args->beta.argFloat = beta; break; case TYPE_COMPLEX_FLOAT: args->alpha.argFloatComplex.s[0] = 1.0; args->alpha.argFloatComplex.s[1] = 0.0; args->beta.argFloatComplex.s[0] = beta; args->beta.argFloatComplex.s[1] = 0.0; break; case TYPE_COMPLEX_DOUBLE: args->alpha.argDoubleComplex.s[0] = 1.0; args->alpha.argDoubleComplex.s[1] = 0.0; args->beta.argDoubleComplex.s[0] = beta; args->beta.argDoubleComplex.s[1] = 0.0; break; } args->M = mi->M; args->N = mi->N; args->K = mi->K; args->A = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY, args->N * args->M * mi->sizeDType, NULL, &status); checkErrorFunc("clCreateBuffer",status); mi->clA = args->A; status = clEnqueueWriteBuffer(genInfo.queue, args->A, CL_TRUE, 0, args->N * args->M * mi->sizeDType, mi->A.v, 0, NULL, NULL); checkErrorFunc("clEnqueueWriteBuffer",status); args->lda.matrix = args->K; args->ldb.matrix = args->K; args->ldc.matrix = args->M; args->B = clCreateBuffer(genInfo.ctx, CL_MEM_READ_ONLY , args->K * args->N * mi->sizeDType, NULL, &status); checkErrorFunc("clCreateBuffer",status); mi->clB = args->B; status = clEnqueueWriteBuffer(genInfo.queue, args->B, CL_TRUE, 0, args->K * args->N * mi->sizeDType, mi->B.v, 0, NULL, NULL); checkErrorFunc("clEnqueueWriteBuffer",status); args->C = clCreateBuffer(genInfo.ctx, CL_MEM_WRITE_ONLY , args->M * args->K * mi->sizeDType, NULL, &status); checkErrorFunc("clCreateBuffer",status); mi->clC = args->C; args->addrBits = genInfo.deviceInfos.addressBits; args->offsetM = 0; args->offsetN = 0; args->offA = 0; args->offBX = 0; args->offCY = 0; args->scimage[0] = mi->clImgA; args->scimage[1] = mi->clImgB; } void initKernelArg( MemoryPattern *pattern, CLBlasKargs args, cl_kernel kernel, CLBlasKernelType kernType, const CLBLASKernExtra *kextra) { unsigned int ind; unsigned int nrArgs; cl_int status; KernelArg karg[MAX_KERNEL_ARGS]; memset(karg, 0, sizeof(KernelArg) * MAX_KERNEL_ARGS); args.kernType = kernType; pattern->sops->assignKargs(karg, &args, kextra); status = clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(nrArgs), &nrArgs, NULL); for (ind = 0; ((ind < nrArgs) && (status == CL_SUCCESS)); ind++) { status = clSetKernelArg(kernel, ind, karg[ind].typeSize, karg[ind].arg.data); } } double runKernel( cl_kernel kernel, cl_device_id device, MemoryPattern *pattern, const GParam *param, //unsigned int dim, CLBlasKargs *args, const void *extra, unsigned int funcID) { unsigned int nrComputeUnits; size_t globalWorkSize[2]; size_t localWorkSize[3]; cl_event evt = NULL; cl_int status; double ret; status = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), (void*)&nrComputeUnits, NULL); checkErrorFunc("clGetDeviceInfo",status); ////////////////////////////////////////////////////////////////////////// //calcWorkGroups(); if (pattern->sops->calcThreads) { pattern->sops->calcThreads(globalWorkSize, param->dims, ¶m->pgran, args, extra); } else { ///// SubproblemDim globDim; SubproblemDim sd[MAX_SUBDIMS]; kargsToProbDims(&globDim, funcID, args, false); sd[0] = param->dims[0]; sd[1] = param->dims[1]; if ((param->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) { if (pattern->sops->innerDecompositionAxis(args) == DECOMP_AXIS_X) { /* * these dimensions will not used more anywhere, so we can * just swap them */ swapDimXY(&(sd[0])); 
swapDimXY(&(sd[1])); swapDimXY(&globDim); } } calcGlobalThreads(globalWorkSize, &(sd[0]), ¶m->pgran, globDim.y, globDim.x); } localWorkSize[0] = param->pgran.wgSize[0]; localWorkSize[1] = param->pgran.wgSize[1]; localWorkSize[2] = 0; fflush(stdout); status = clEnqueueNDRangeKernel(genInfo.queue, kernel, param->pgran.wgDim, NULL, globalWorkSize, localWorkSize, 0, NULL, &evt); clReleaseKernel(kernel); checkErrorFunc("clEnqueueNDRangeKernel",status); #if 0 { cl_ulong start, end; status = clFinish(genInfo.queue); checkErrorFunc("clFinish", status); status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); checkErrorFunc("clGetEventProfilingInfo",status); status = clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); checkErrorFunc("clGetEventProfilingInfo",status); end -= start; end /= 1000; ret = (double)end/1000; } #else { nano_time_t time; status = flushAll(genInfo.queue); checkErrorFunc("flushAll", status); time = getCurrentTime(); status = waitForSuccessfulFinish(genInfo.queue, &evt); checkErrorFunc("waitForSuccessfulFinish", status); time = getCurrentTime() - time; ret = (double)conv2nanosec(time)/1000; ret /= 1000; } #endif clReleaseEvent(evt); return ret; } void subInitMatrixInfo( MatrixInfo *matrixInfo, DataType dt, unsigned int sizeType) { matrixInfo->dtype = dt; matrixInfo->sizeDType = sizeType; matrixInfo->A.v = malloc(matrixInfo->N * matrixInfo->M * sizeType); matrixInfo->B.v = malloc(matrixInfo->N * matrixInfo->K * sizeType); matrixInfo->C.v = malloc(matrixInfo->M * matrixInfo->K * sizeType); } void initMatrixFloat(FPtr* m, int maxi) { int i; for (i = 0; i < maxi; ++i) { m->f[i] = 1.0; } } void initMatrixInfo( MatrixInfo *mi, DataType dt, DeviceInfo* di, BlasExtraInfo* bExtra ) { unsigned int nDim; BlasFunctionInfo* bFunc = bExtra->parent->parent; for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) { unsigned int i; unsigned int dimension = getDimension(nDim, dt, di, bFunc->funcNo); if (bFunc != NULL && bFunc->initKNM != NULL) { bFunc->initKNM(mi, dimension); } else { mi->K = dimension; mi->N = dimension; mi->M = dimension; } switch (dt) { case TYPE_FLOAT: subInitMatrixInfo(mi, dt, sizeof(cl_float)); initMatrixFloat(&mi->A, mi->K * mi->M); initMatrixFloat(&mi->B, mi->N * mi->K); break; case TYPE_DOUBLE: subInitMatrixInfo(mi, dt, sizeof(cl_double)); for (i = 0; i < mi->K * mi->M; ++i) { mi->A.d[i] = 1.0; } for (i = 0; i < mi->N * mi->K; ++i) { mi->B.d[i] = 1.0; } break; case TYPE_COMPLEX_FLOAT: subInitMatrixInfo(mi, dt, sizeof(cl_float2)); for (i = 0; i < mi->K * mi->M; ++i) { mi->A.f2[i].s[0] = 1.0; mi->A.f2[i].s[1] = 0.0; } for (i = 0; i < mi->N * mi->K; ++i) { mi->B.f2[i].s[0] = 1.0; mi->B.f2[i].s[1] = 0.0; } break; case TYPE_COMPLEX_DOUBLE: subInitMatrixInfo(mi, dt, sizeof(cl_double2)); for (i = 0; i < mi->K * mi->M; ++i) { mi->A.d2[i].s[0] = 1.0; mi->A.d2[i].s[1] = 0.0; } for (i = 0; i < mi->N * mi->K; ++i) { mi->B.d2[i].s[0] = 1.0; mi->B.d2[i].s[1] = 0.0; } break; default: exit (EXIT_COD_UNKNOWN_DATATYPE); } mi->clA = NULL; mi->clB = NULL; mi->clC = NULL; mi->clImgA = NULL; mi->clImgB = NULL; mi->imgA = NULL; mi->imgB = NULL; } } void releaseMemObjOne(MatrixInfo * mi) { clReleaseMemObject(mi->clA); clReleaseMemObject(mi->clB); clReleaseMemObject(mi->clC); mi->clA = NULL; mi->clB = NULL; mi->clC = NULL; mi->clImgA = NULL; mi->imgA = NULL; mi->clImgB = NULL; mi->imgB = NULL; } void releaseMemObjAll(MatrixInfo * mi, BlasExtraInfo* bExtra) { unsigned int nDim; for (nDim = 0; nDim < bExtra->numParam; 
++nDim, mi++) { releaseMemObjOne(mi); } } void destroyMatrixInfo(MatrixInfo* mi, BlasExtraInfo* bExtra) { unsigned int nDim; for (nDim = 0; nDim < bExtra->numParam; ++nDim, mi++) { free(mi->A.v); free(mi->B.v); free(mi->C.v); } } void logBest( unsigned int * bestParam, unsigned int nDim, GParam * gp, double * bestTime) { fprintf(logStream, " %d %s = %f\n",bestParam[nDim], gp->name, bestTime[nDim]); fflush(logStream); } void logCheckError(int dim) { fprintf(logStream, " [%5d]: NOT FOUND\n", dim); } void logCheck( int dim, SubproblemDim* sdim, PGranularity* pgran, double t, double oldt, bool kern) { GParam gp; gp.dims[0] = sdim[0]; gp.dims[1] = sdim[1]; gp.dims[2] = sdim[2]; gp.pgran = *pgran; createGParamName(&gp); if (genInfo.aExtendedOutput) { if (oldt == 0) { fprintf(logStream, " [%5d]: %s - %7g ",dim, gp.name, t); oldt = t; } if (fabs(t - oldt) < 0.0001) { fprintf(logStream, (kern) ? "* " : "+ "); } else { fprintf(logStream, "- "); } } fflush(logStream); } void logParamName(GParam * params, int cur, int max) { if (genInfo.aExtendedOutput) { fprintf(logStream, "%3i/%-3i, %s :", cur, max, params->name); fflush(logStream); /* For Debug GEMM, Memmory pattern #4 fprintf(logStream, "%3i/%-3i; wg: %dx%d; iB: %lux%lu; gB: %lux%lu; bw: %lu", cur, max, params->pgran.wgSize[1], params->pgran.wgSize[0], params->dims[1].x, params->dims[1].y, params->dims[0].x, params->dims[0].y, params->dims[0].bwidth); */ fflush(logStream); } else { if (cur > 0) { fprintf(logStream, "\b\b\b\b\b\b\b"); } fprintf(logStream, "%5.2f%% ", genInfo.last + (genInfo.next - genInfo.last)*cur/max); fflush(logStream); } } void logTime(double time) { if (genInfo.aExtendedOutput) { fprintf(logStream, " %7.2f", time); fflush(logStream); } } void logKernalGen(void) { if (genInfo.aExtendedOutput) { fprintf(logStream, " *"); fflush(logStream); } } void logPattern(const char * patternName) { if ( genInfo.aExtendedOutput || genInfo.patternName != patternName ) { fprintf(logStream, "%s is being tuned, progress: ", patternName); if (genInfo.aExtendedOutput) { fprintf(logStream, "\n"); }else { fprintf(logStream, " "); } fflush(logStream); genInfo.patternName = patternName; } } void logEndString(void) { if (genInfo.aExtendedOutput) { fprintf(logStream, "\n"); fflush(logStream); } } void logExtraFlag( KernelExtraFlags flags, KernelExtraFlags flag, const char * trueName, const char * falseName ) { if ((flags & flag) > 0) { fprintf(logStream, "%s", trueName); } else { fprintf(logStream, "%s", falseName); } } void logEndPattern(unsigned int func, unsigned int patt) { //bool isFunc = (genInfo.aFunc == -1 || genInfo.aFunc == (int)func); bool isFunc = genInfo.aFunc[func]; bool isPattern = (genInfo.aPattern == -1 || genInfo.aPattern == (int)patt); if (!(isFunc && isPattern)) { return; } if (!genInfo.aExtendedOutput) { fprintf(logStream, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"); fprintf(logStream, "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"); fprintf(logStream, " tuning is complete. \n"); } fprintf(logStream, "\n"); fflush(logStream); } void logExtra(BlasExtraInfo* bExtra) { const char* strType = ""; const char* strTrans = ""; KernelExtraFlags flags = bExtra->flags; if (!genInfo.aExtendedOutput) { fprintf(logStream, "\b\b\b\b\b\b\b\b %5.2f%% ", genInfo.last); } else { fprintf(logStream, " Flag (%d):(clblas*)", flags); strTrans = (flags & KEXTRA_TRANS_A)? " Trans": " NoTrans"; logExtraFlag(flags, KEXTRA_CONJUGATE_A, " ConjTrans", strTrans); fprintf(logStream, "(A)"); strTrans = (flags & KEXTRA_TRANS_B)? 
" Trans": " NoTrans"; logExtraFlag(flags, KEXTRA_CONJUGATE_B, " ConjTrans", strTrans); fprintf(logStream, "(B)"); logExtraFlag(flags, KEXTRA_COLUMN_MAJOR, " ColumnMajor", " RowMajor"); logExtraFlag(flags, KEXTRA_UPPER_TRIANG, " Upper", " Lower"); logExtraFlag(flags, KEXTRA_SIDE_RIGHT, " Right", " Left"); fprintf(logStream, " \n"); switch (bExtra->dtype) { case TYPE_FLOAT: strType = "FLOAT"; break; case TYPE_DOUBLE: strType = "DOUBLE"; break; case TYPE_COMPLEX_FLOAT: strType = "COMPLEX_FLOAT"; break; case TYPE_COMPLEX_DOUBLE: strType = "COMPLEX_DOUBLE"; break; } fprintf(logStream, " TYPE = %s:", strType); } fflush(logStream); logEndString(); } void logError(void) { fprintf(logStream, " An internal kernel build error occurred!\n"); fflush(logStream); } static void releaseSCImage(void** buf, cl_mem* clImg) { if (*clImg != NULL) { clReleaseMemObject(*clImg); *clImg = NULL; free(*buf); *buf = NULL; } } static cl_int createSCImage( void **buf, cl_mem *image) { cl_image_format format = { CL_RGBA, CL_FLOAT }; size_t width, height, maxWidth, maxHeight; cl_int status; cl_ulong memSize; cl_device_id device; cl_int err; err = clGetContextInfo(genInfo.ctx, CL_CONTEXT_DEVICES, sizeof(device), &device, NULL); if (err != CL_SUCCESS) { return err; } err = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(memSize), &memSize, NULL); if (err != CL_SUCCESS) { return err; } err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(maxWidth), &maxWidth, NULL); if (err != CL_SUCCESS) { return err; } err = clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(maxHeight), &maxHeight, NULL); if (err != CL_SUCCESS) { return err; } // some functions need 2 scratch images memSize /= 2; height = (size_t)sqrt((double)memSize / sizeof(cl_float)); width = height / 4; if (height > maxHeight) { height = maxHeight; } if (width > maxWidth) { width = maxWidth; } *buf = calloc(width * height, 4 * sizeof(cl_float)); if (buf == NULL) { return CL_OUT_OF_HOST_MEMORY; } *image = clCreateImage2D(genInfo.ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &format, width, height, 0, *buf, &status); if (*image == NULL) { free(*buf); *buf = NULL; return status; } return CL_SUCCESS; } static void generatePrepKernel( cl_device_id device, MemoryPattern *pattern, GParam * param, CLBlasKargs *args, CLBLASKernExtra *extra, CLBlasKernelType kernType) { PGranularity pgran; Kernel *k = NULL; size_t bSize; char* bin; cl_int status; cl_ulong ldsSize; CLBlasKernelType kernTypeOld = extra->kernType; DUMMY_ARG_USAGE(args); extra->kernType = kernType; clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &ldsSize, NULL); pgran = param->pgran; k = makeKernel( device, genInfo.ctx, pattern->sops->genKernel, 0, param->dims, &pgran, extra, NULL, NULL); status = clGetProgramInfo(k->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bSize, NULL); checkErrorFunc("clGetProgramInfo", status); bin = (char *)malloc(sizeof(char)*bSize); status = clGetProgramInfo(k->program, CL_PROGRAM_BINARIES, sizeof(char*), &bin, NULL); checkErrorFunc("clGetProgramInfo", status); if (kernType == CLBLAS_PREP_A_KERNEL) { param->binariesA = bin; param->binary_sizesA = bSize; param->kernelPrepA = k; } if (kernType == CLBLAS_PREP_B_KERNEL) { param->binariesB = bin; param->binary_sizesB = bSize; param->kernelPrepB = k; } extra->kernType = kernTypeOld; } void delGParam(GParam * gp) { if (gp != NULL) { gp->count --; if (gp->count ==0){ destroyKernels(gp); free(gp->binaries); free(gp->binariesA); free(gp->binariesB); free(gp); gp = NULL; } } } void 
setFlagsDependentOnDevice( CLBlasKargs* args, CLBLASKernExtra* extra, GParam* parCur, unsigned int func, unsigned int patt ) { SolutionStep step; cl_int status; step.args = *args; step.cmdQueue = genInfo.queue; step.extraFlags = extra->flags; step.funcID = func; step.kernels[0] = NULL; step.kernels[1] = NULL; step.kernels[2] = NULL; //step.node = NULL; step.numEventsInWaitList = 0; step.patternID = patt; step.pgran = parCur->pgran; step.subdims[0] = parCur->dims[0]; step.subdims[1] = parCur->dims[1]; step.subdims[2] = parCur->dims[2]; step.device.id = genInfo.targetDevice.id; status = identifyDevice(&step.device); checkErrorFunc("identifyDevice", status); if (step.device.ident.vendor == VENDOR_AMD) { extra->flags |= (KEXTRA_VENDOR_AMD | KEXTRA_ENABLE_MAD); } selectVectorization(&step, extra); } bool genAllKernel( CLBlasKargs* args, CLBLASKernExtra extra, GParam* parCur, MemoryPattern * pattern, unsigned int func, unsigned int patt ) { bool ret; cl_device_id device = genInfo.targetDevice.id; if (func == (unsigned int)CLBLAS_SYRK || func == (unsigned int)CLBLAS_SYR2K) { extra.flags |= KEXTRA_SYRK_SEPARATE_DIAGONAL; } setFlagsDependentOnDevice(args, &extra, parCur, func, patt); // fixup work group size in respect with desired work dispatch order if ((parCur->pgran.wgDim == 2) && pattern->sops->innerDecompositionAxis) { if (pattern->sops->innerDecompositionAxis(args) == DECOMP_AXIS_X) { unsigned int u; u = parCur->pgran.wgSize[0]; parCur->pgran.wgSize[0] = parCur->pgran.wgSize[1]; parCur->pgran.wgSize[1] = u; } } if (pattern->sops->fixupArgs) { pattern->sops->fixupArgs(args, parCur->dims, &extra); } ret = genKernel(parCur, &extra, pattern); if (patternUseImages(pattern)) { generatePrepKernel(device, pattern, parCur, args, &extra, CLBLAS_PREP_A_KERNEL); generatePrepKernel(device, pattern, parCur, args, &extra, CLBLAS_PREP_B_KERNEL); } return ret; } double runAllKernel( MemoryPattern * pattern, CLBlasKargs *args, GParam* parCur, unsigned int funcId, double bestTime) { double time; double minTime = 1e30; int i; cl_device_id device = genInfo.targetDevice.id; int max_run_kernel = MAX_RUN_KERNEL + (funcBlasLevel(funcId) == 2 ? 
7 : 0); cl_int status; cl_kernel kernel; if (patternUseImages(pattern)) { /////////////// A ////////////// cl_kernel kPrepA; cl_kernel kPrepB; status = clCreateKernelsInProgram( parCur->kernelPrepA->program, 1, &kPrepA, NULL); checkErrorFunc("clGetProgramInfo", status); initKernelArg(pattern, *args, kPrepA, CLBLAS_PREP_A_KERNEL, parCur->kernelPrepA->extra); args->kernType = CLBLAS_PREP_A_KERNEL; time = runKernel(kPrepA, device, pattern, parCur, args, parCur->kernelPrepA->extra, funcId); /////////////// B ////////////// status = clCreateKernelsInProgram( parCur->kernelPrepB->program, 1, &kPrepB, NULL); checkErrorFunc("clGetProgramInfo", status); initKernelArg(pattern, *args, kPrepB, CLBLAS_PREP_B_KERNEL, parCur->kernelPrepB->extra); args->kernType = CLBLAS_PREP_B_KERNEL; time = runKernel(kPrepB, device, pattern, parCur, args, parCur->kernelPrepB->extra, funcId); args->kernType = CLBLAS_COMPUTING_KERNEL; } for (i = 0; i < max_run_kernel; ++i) { status = clCreateKernelsInProgram(parCur->kernel->program, 1, &kernel, NULL); checkErrorFunc("clGetProgramInfo", status); initKernelArg(pattern, *args, kernel, CLBLAS_COMPUTING_KERNEL, parCur->kernel->extra); time = runKernel(kernel, device, pattern, parCur, args, parCur->kernel->extra, funcId); minTime = fmin(time, minTime); if (minTime > bestTime*2 && i >= max_run_kernel/2 && minTime > 2) { break; } } return minTime; } GParam* createParCur(SubDimInfo *sdi) { GParam* parCur = calloc(1, sizeof(GParam)); parCur->count ++; parCur->dims[0] = sdi->sdim[0]; parCur->dims[1] = sdi->sdim[1]; parCur->dims[2] = sdi->sdim[2]; parCur->pgran = sdi->pgran; createGParamName(parCur); return parCur; } GParam* createParCur2(GParam* sdi) { GParam* parCur = calloc(1, sizeof(GParam)); parCur->count = 1; parCur->dims[0] = sdi->dims[0]; parCur->dims[1] = sdi->dims[1]; parCur->dims[2] = sdi->dims[2]; parCur->pgran = sdi->pgran; parCur->vecLen = sdi->vecLen; parCur->binaries = NULL; parCur->binariesA = NULL; parCur->binariesB = NULL; parCur->binary_sizes = 0; parCur->binary_sizesA = 0; parCur->binary_sizesB = 0; parCur->time = sdi->time; createGParamName(parCur); return parCur; } static void setParam(BlasParamInfo* bParam, double time, GParam* parCur) { bParam->time = time; bParam->pGran = parCur->pgran; bParam->sDim[0] = parCur->dims[0]; bParam->sDim[1] = parCur->dims[1]; bParam->sDim[2] = parCur->dims[2]; // if (genInfo.aIsKernel) { bParam->kSize[0] = (unsigned int)parCur->binary_sizes; bParam->kSize[1] = (unsigned int)parCur->binary_sizesA; bParam->kSize[2] = (unsigned int)parCur->binary_sizesB; } else { bParam->kSize[0] = 0; bParam->kSize[1] = 0; bParam->kSize[2] = 0; } } int VISIBILITY_HIDDEN comp(const void *i, const void *j) { return *(double *)i < *(double *)j; } void VISIBILITY_HIDDEN initCLBLASExtra(CLBLASKernExtra* extra, BlasExtraInfo* bExtra) { memset( extra, 0, sizeof(CLBLASKernExtra) ); // if (bExtra) { extra->dtype = bExtra->dtype; extra->flags = bExtra->flags; // extra->vecLen = bExtra->vecLen; // } } #ifdef TEST_LOG typedef struct LOG_FILE { FILE* f; bool readElem; double t1, t2, t3; double tall; }LOG_FILE; typedef struct LOG_STAT { int count; double minTime; double maxTime; double midleTime; }LOG_STAT; void openLogFile(LOG_FILE* lf, char* fileName) { if ((lf->f = fopen(fileName, "a+")) != NULL) { } } void closeLogFile(LOG_FILE* lf) { fclose(lf->f); } bool readElemLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen) { unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl; double t1, t2, t3, tall; fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", 
&l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall); if ( l0x == sd[0].x && l0y == sd[0].y && l0w == sd[0].bwidth && l1x == sd[1].x && l1y == sd[1].y && l1w == sd[1].bwidth && vl == vecLen) { lf->t1 = t1; lf->t2 = t2; lf->t3 = t3; lf->tall = tall; return true; } return false; } double readLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen) { lf->t3 = 0; lf->readElem = readElemLogFile(lf, sd, vecLen); if (!lf->readElem) { rewind(lf->f); while (!lf->readElem && !feof(lf->f)) { lf->readElem = readElemLogFile(lf, sd, vecLen); } } return lf->t3; } double saveLogFile(LOG_FILE* lf, SubproblemDim* sd, unsigned int vecLen, double* time, double timeAll) { if (!lf->readElem) { fprintf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", (unsigned int)sd[0].x, (unsigned int)sd[0].y, (unsigned int)sd[0].bwidth, (unsigned int)sd[1].x, (unsigned int)sd[1].y, (unsigned int)sd[1].bwidth, vecLen, time[0], time[1], time[2], timeAll); } return lf->t3; } void getBestVariant(LOG_FILE* lf) { rewind(lf->f); lf->tall = 0; while (!feof(lf->f)) { unsigned int l0x, l0y, l0w, l1x, l1y, l1w, vl; double t1, t2, t3, tall; fscanf(lf->f, "%u %u %u %u %u %u %u - %lf %lf %lf %lf\n", &l0x, &l0y, &l0w, &l1x, &l1y, &l1w, &vl, &t1, &t2, &t3, &tall); lf->t1 = fmin(t1, lf->t1); lf->t2 = fmin(t2, lf->t2); lf->t3 = fmin(t3, lf->t3); lf->tall += tall; } } #endif static void findBestParams( MemoryPattern *pattern, unsigned int func, unsigned int patt, bool isEnvPattSelected, BlasExtraInfo* bExtra, GParam* bestParam[DIMARRAYCOUNT]) { unsigned int nDim; SolutionStep step; MatrixInfo mi [DIMARRAYCOUNT]; //cl_kernel kernel_old[MAX_CLBLAS_KERNELS_PER_STEP]; double time[DIMARRAYCOUNT]; CLBLASKernExtra extra; SubDimInfo sdi; void* imgA = NULL; cl_mem clImgA = NULL; void* imgB = NULL; cl_mem clImgB = NULL; int curStep; unsigned int dimension; #ifdef TEST_LOG LOG_FILE lf; double all_time = 0; double step_time; char str[1000]; #endif memset(time, 0, sizeof(time)); initCLBLASExtra(&extra, bExtra); #ifdef TEST_LOG sprintf(str, "test_%d_%d_%d_%d.log",func, patt, extra.dtype, extra.flags); openLogFile(&lf,str); #endif // create images if (patternUseImages(pattern)) { cl_int status; // Init Image status = createSCImage(&imgA, &clImgA); checkErrorFunc("createSCImage", status); status = createSCImage(&imgB, &clImgB); checkErrorFunc("createSCImage", status); } initSubDimInfo(&sdi, pattern, &genInfo.deviceInfos, func, patt, extra.dtype, extra.flags); initMatrixInfo(mi, extra.dtype, &genInfo.deviceInfos, bExtra); resetSubdim(&sdi); curStep = 0; while (nextSubdim(&sdi, genInfo.aMaxparam, time[bExtra->numParam - 1])) { GParam* parCur; GParam* lastbest[DIMARRAYCOUNT]; bool isKernelValid; if (bExtra) { parCur = createParCur(&sdi); } globalDim++; curStep++; logParamName(parCur, curStep, sdi.varCount); #ifdef TEST_LOG step_time = getCurrentTime(); time[DIMARRAY_BIG] = readLogFile(&lf, sdi.sdim, sdi.vecLen); if (!lf.readElem) { #endif #ifdef TEST_LOG } else { time[DIMARRAY_SMALL] = lf.t1; time[DIMARRAY_MIDDLE] = lf.t2; time[DIMARRAY_BIG] = lf.t3; step_time = lf.tall; } #endif for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bExtra){ lastbest[nDim] = NULL; } } isKernelValid = 0; for (nDim = 0; nDim < bExtra->numParam; nDim++) { BlasParamInfo* bParam; // can current combination of flags be handled by selected pattern bool isProbSupported = false; dimension = getDimension(nDim, extra.dtype, &genInfo.deviceInfos, bExtra->parent->parent->funcNo); // setup kernel arguments if (patternUseImages(pattern)) { // Init Image mi[nDim].imgA = imgA; 
mi[nDim].clImgA =clImgA; mi[nDim].imgB = imgB; mi[nDim].clImgB =clImgB; } // Incorrect subdimension for a given size of the matrix if ( dimension < sdi.sdim[0].x || dimension % sdi.sdim[0].x != 0 || dimension < sdi.sdim[0].y || dimension % sdi.sdim[0].y != 0 || dimension < sdi.sdim[0].bwidth || dimension % sdi.sdim[0].bwidth != 0 ) { releaseMemObjOne(mi + nDim); if (genInfo.aExtendedOutput) { fprintf(logStream, " "); } // write dummy data time[nDim] = -1; continue; } step.extraFlags = extra.flags; step.funcID = func; initCLBlasKArgDim( &step.args, mi + nDim, extra.flags ); // assuming that all // "old-fashioned" patterns, providing no performance estimation // function can handle any set of arguments/flags if ( NULL == pattern->sops->getPatternPerf || pattern->sops->getPatternPerf( step.extraFlags, (void*)&step.args ) >= 0 ) { isProbSupported = true; } else { isProbSupported = false; } // if current flags and dimensions are not optimal for current // pattern - skip building and running kernel. // But if the pattern is selected by environment // and can handle current problem - tune it anyway. if ( (patt != selectPattern( &step, 0 ) && (!isEnvPattSelected || !isProbSupported)) ) { releaseMemObjOne(mi + nDim); // write dummy data time[nDim] = -1; bestParam[nDim] = NULL; continue; } if ( 0 == isKernelValid ) { isKernelValid = genAllKernel( &step.args, extra, parCur, pattern, func, patt); logKernalGen(); } if ( 0 == isKernelValid ) { releaseMemObjOne(mi + nDim); logError(); break; } bParam = &(bExtra->param[nDim]); #ifdef TEST_LOG if (!lf.readElem) { #endif time[nDim] = runAllKernel(pattern, &step.args, parCur, func, bParam->time); releaseMemObjOne(mi + nDim); #ifdef TEST_LOG } #endif logTime(time[nDim]); if (bParam->time > time[nDim]) { if (bExtra) { BlasParamInfo* bParamNT = &(bExtra->param[nDim]); setParam(bParamNT, time[nDim], parCur); lastbest[nDim] = bestParam[nDim]; bestParam[nDim] = parCur; parCur->count++; } } } for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bExtra) { delGParam(lastbest[nDim]); lastbest[nDim] = NULL; } } #ifdef TEST_LOG step_time = ((double)(getCurrentTime()) - step_time)/1000000; saveLogFile(&lf, sdi.sdim, sdi.vecLen, time, step_time); if (lf.readElem) { step_time = lf.tall; } logTime(step_time); all_time += step_time; #endif logEndString(); releaseMemObjAll(mi, bExtra); if (bExtra) { delGParam(parCur); } } #ifdef TEST_LOG // Show log resetSubdim(&sdi); double t; double all = 0; int count = 0; time[DIMARRAY_SMALL] = 5000.0; time[DIMARRAY_MIDDLE] = 5000.0; time[DIMARRAY_BIG] = 5000.0; sdi.returnAll = true; do { t = readLogFile(&lf, sdi.sdim, sdi.vecLen); if (lf.readElem) { time[DIMARRAY_SMALL] = fmin(lf.t1, time[DIMARRAY_SMALL]); time[DIMARRAY_MIDDLE] = fmin(lf.t2, time[DIMARRAY_MIDDLE]); time[DIMARRAY_BIG] = fmin(lf.t3, time[DIMARRAY_BIG]); all+= lf.tall; count++; } else { printf ("^"); } } while (nextSubdim(&sdi, genInfo.aMaxparam, t)); #ifdef TEST_LOG getBestVariant(&lf); #endif lf.t1 = time[DIMARRAY_SMALL]; lf.t2 = time[DIMARRAY_MIDDLE]; lf.t3 = time[DIMARRAY_BIG]; lf.tall = all; fprintf(logStream, "---------------------------------------------------\n"); fprintf(logStream, " steps time1 time2 time3 AllTime \n"); int tmin = (int)(lf.tall/1000/60); int tsec = (int)(lf.tall/1000) - tmin*60; fprintf(logStream, " --> Best %5d %7.2lf %7.2lf %7.2lf %2d:%2d \n", count, lf.t1, lf.t2, lf.t3, tmin, tsec); tmin = (int)(all_time/1000/60); tsec = (int)(all_time/1000) - tmin*60; fprintf(logStream, " --> Fast %5d %7.2lf %7.2lf %7.2lf %2d:%2d\n", curStep, 
bExtra->param[DIMARRAY_SMALL].time, bExtra->param[DIMARRAY_MIDDLE].time, bExtra->param[DIMARRAY_BIG].time, tmin,tsec); globalFastTime += all_time; globalTime += lf.tall; closeLogFile(&lf); #endif logEndString(); // Release image releaseSCImage(&imgA, &clImgA); releaseSCImage(&imgB, &clImgB); destroyMatrixInfo(mi, bExtra); } double checkData( TargetDevice* devID, const MemoryPattern * pattern, DataType dtype, KernelExtraFlags flags, int dim, double oldt) { SubproblemDim sdim[MAX_SUBDIMS]; PGranularity pgran; double time; int i; unsigned char* buffer[MAX_CLBLAS_KERNELS_PER_STEP]; size_t sizeBuffer[MAX_CLBLAS_KERNELS_PER_STEP]; int status; for (i = 0; i < MAX_CLBLAS_KERNELS_PER_STEP; ++i) { buffer[i] = NULL; sizeBuffer[i] = 0; } status = getGranularityInfo(devID, pattern->name, dtype, flags, dim, sdim, &pgran, &time); if (status == GF_SUCCESS) { status = getKernelInfo(devID, pattern->name, dtype, flags, dim, buffer, sizeBuffer); logCheck(dim, sdim, &pgran, time, oldt, buffer[0] != NULL); } else { logCheckError(dim); } free(buffer[0]); free(buffer[1]); free(buffer[2]); return time; } void logDimension(BlasFunctionInfo* bFunc) { int func = bFunc->funcNo; int i; if (genInfo.aExtendedOutput) { printf("FLOAT "); for (i = 0; i < DIMARRAYCOUNT; ++i) { printf(" %6u", getDimension(i, TYPE_FLOAT, &genInfo.deviceInfos, func)); } printf("\n"); printf("DOUBLE "); for (i = 0; i < DIMARRAYCOUNT; ++i) { printf(" %6u", getDimension(i, TYPE_DOUBLE, &genInfo.deviceInfos, func)); } printf("\n"); printf("COMPLEX FLOAT "); for (i = 0; i < DIMARRAYCOUNT; ++i) { printf(" %6u", getDimension(i, TYPE_COMPLEX_FLOAT, &genInfo.deviceInfos, func)); } printf("\n"); printf("COMPLEX DOUBLE "); for (i = 0; i < DIMARRAYCOUNT; ++i) { printf(" %6u", getDimension(i, TYPE_COMPLEX_DOUBLE, &genInfo.deviceInfos, func)); } printf("\n"); } } void calcExtraCount(int index, int indexCount) { genInfo.last = (double)index/indexCount*100; genInfo.next = (double)(index + 1)/indexCount*100; } int isFlag(BlasExtraInfo* info, KernelExtraFlags flag) { return (info->flags & flag) == flag; } int isNoFlag(BlasExtraInfo* info, KernelExtraFlags flag) { return (info->flags & flag) == 0; } /* Check if current set of flags and datatype should be evaluated for current function and pattern. It may be skipped due to compiler/runtime bugs and if it is considered slow for any of the checked problem sizes If skipSlowPatt parameter is set to false - pattern considered slow is tuned anyway, otherwise, it is skipped */ bool skipFlags(BlasExtraInfo* info, int patt, int func, DeviceInfo* pDI, bool skipSlowPatt) { bool b = false; int i; bool shouldTunePatt = false; SolutionStep step; memset( &step, 0, sizeof(SolutionStep) ); step.funcID = func; step.patternID = patt; step.extraFlags = info->flags; (void)func; (void)patt; /* evaluate problem sizes */ /* skip pattern, if it is not optimal for any of the dimensions for current flags */ if ( skipSlowPatt ) { for ( i = 0; i < DIMARRAYCOUNT; i++ ) { step.args.M = getDimension( i, info->dtype, pDI, func ); step.args.N = getDimension( i, info->dtype, pDI, func ); step.args.K = getDimension( i, info->dtype, pDI, func ); if ( selectPattern( &step, 0 ) == (unsigned int)patt ){ shouldTunePatt = true; } } if( false == shouldTunePatt ){ return true; } } b |= (func == CLBLAS_SYMV) && (info->dtype == TYPE_COMPLEX_FLOAT); b |= (func == CLBLAS_SYMV) && (info->dtype == TYPE_COMPLEX_DOUBLE); /* * WORKAROUND for WINDOWS: Now, for many subproblem dimensions, * when tuning TRMM, SYRK, SYR2K functions * for complex-double type, gives BSoD. 
*/ #if defined(_WIN32) b |= (func == CLBLAS_TRSM) && (info->dtype == TYPE_COMPLEX_DOUBLE); b |= (func == CLBLAS_SYRK) && (info->dtype == TYPE_COMPLEX_DOUBLE); b |= (func == CLBLAS_SYR2K) && (info->dtype == TYPE_COMPLEX_DOUBLE); #endif b |= !info->isUseForTunning; return b; } bool isFilter(BlasExtraInfo* info, int patt, int func) { int dType = (int)info->dtype; int flag = (int)info->flags; bool isFunc = genInfo.aFunc[func]; bool isPattern = (genInfo.aPattern == -1 || genInfo.aPattern == patt); bool isDataType = genInfo.aDType[dType]; bool isFlag = (genInfo.aFlag == -1 || genInfo.aFlag == flag); return (!(isFunc && isPattern && isDataType && isFlag)); } void initParamsTime(BlasExtraInfo* bExtra) { unsigned int nDim; for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bExtra){ bExtra->param[nDim].time += 1e50; } } } void saveBestParams( BlasExtraInfo* bExtra, GParam* bestParam[DIMARRAYCOUNT]) { unsigned int nDim; for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bExtra){ BlasParamInfo* bParam = &bExtra->param[nDim]; if (bestParam[nDim] != NULL){ saveBestParam(&genInfo.targetDevice, bParam); } } } } void deleteGParams (BlasExtraInfo* bExtra, GParam* bestParam[DIMARRAYCOUNT]) { unsigned int nDim; for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bExtra){ delGParam(bestParam[nDim]); } } } void checkDatas(BlasExtraInfo* bExtra, const MemoryPattern* pattern) { unsigned int nDim; double t; unsigned int dimension; int func = bExtra->parent->parent->funcNo; for (nDim = 0; nDim < bExtra->numParam; nDim++) { t = 0; if (bExtra) { dimension = getDimension(nDim, bExtra->dtype, &genInfo.deviceInfos, func); if(nDim == BANK_ALIGNED_CASE_RECORD_IDX) { dimension = 0; } // TODO add implementation checkData (void) pattern; t = checkData(&genInfo.targetDevice, pattern, bExtra->dtype, bExtra->flags, dimension, t); } logEndString(); } } void generateKernelForOthersFlag( BlasExtraInfo* bExtra, GParam* bestParam[DIMARRAYCOUNT], MemoryPattern* pattern) { unsigned int nExtra; BlasPatternInfo* bPatt = bExtra->parent; BlasFunctionInfo* bFunc = bPatt->parent; BlasExtraInfo* bExtraOther; CLBLASKernExtra extra; GParam* bestParamOther[DIMARRAYCOUNT]; unsigned int nDim; CLBlasKargs args; memset( bestParamOther, 0, sizeof(GParam*)*DIMARRAYCOUNT ); for (nExtra = 0; nExtra < bPatt->numExtra; ++nExtra) { bool isMaskFlag; bool isEqFlag; bool isDataType; unsigned int mask; bExtraOther = &(bPatt->extra[nExtra]); mask = bExtraOther->flags & bFunc->maskForTuningsKernel; isMaskFlag = mask == bExtra->flags; isEqFlag = bExtraOther->flags == bExtra->flags; isDataType = bExtra->dtype == bExtraOther->dtype; if (isDataType && isMaskFlag && !isEqFlag) { for (nDim = 0; nDim < bExtra->numParam; nDim++) { if (bestParam[nDim] == NULL) { continue; } bestParamOther[nDim] = createParCur2(bestParam[nDim]); } for (nDim = 0; nDim < bExtra->numParam; nDim++) { unsigned int nd; if (bestParam[nDim] == NULL) { continue; } for (nd = 0; nd < nDim; ++nd) { if (bestParam[nDim] == bestParam[nd]) { bestParamOther[nDim] = bestParamOther[nd]; bestParamOther[nDim]->count++; } } // If the user selected that they want to store the kernel binaries to disk, // and we do not have those binaries, generate them again if (genInfo.aIsKernel && bestParamOther[nDim]->kernel == NULL) { MatrixInfo mi [DIMARRAYCOUNT]; unsigned int func = bFunc->funcNo; unsigned int patt = bPatt->pattNo; // Initialize resources to generate kernels in genAllKernel initCLBLASExtra(&extra, bExtra); initMatrixInfo( mi, extra.dtype, &genInfo.deviceInfos, bExtra ); initCLBlasKArgDim( &args, 
mi, extra.flags ); genAllKernel(&args, extra, bestParamOther[nDim], pattern, func, patt); // Free those resources when finished releaseMemObjAll( mi, bExtra ); destroyMatrixInfo( mi, bExtra ); logKernalGen( ); } // This stores the kernel binaries to disk saveBestParams(bExtraOther, bestParamOther); } deleteGParams(bExtraOther, bestParamOther); } } } BlasPatternInfo* getPattern(BlasFunctionID fid, int pid) { BlasFunctionInfo* pFunc = getBlasFunctionInfo(&genInfo.targetDevice, fid); return &pFunc->pattInfo[pid]; } void configurePattern(void) { // Initialization specific to the handler function. //getPattern(CLBLAS_XXXX, 0)->isPGValid = ; //getPattern(CLBLAS_XXXX, 0)->initSubdim = ; } bool isRebuild(BlasExtraInfo* bExtra) { unsigned int nDim; bool ret = genInfo.aCommand != C_DEFAULT; for (nDim = 0; nDim < bExtra->numParam; ++nDim) { BlasParamInfo* bParam = &bExtra->param[nDim]; ret |= bParam->sstatus == SS_NOLOAD; if (bParam->offset == 0 ) { printf("*****\n"); } } return ret; } void createFile(void) { unsigned int funcId; unsigned int pattId = 0; unsigned int envPattId = 0; bool isEnvPattSelected = false; unsigned int dev; // This intializes global genInfo with either the last detected platform, or the // first AMD platform it finds. It records the number of devices in that platform. initOpenCl( ); // For each devices for (dev = 0; dev < genInfo.numDevices; dev++) { initDevice(dev); // The following creates the .kdb file on disk according to the set environment variable writeStorageCache(&genInfo.targetDevice); // The following creates the OpenCL context and commanqueue for the first device in global genInfo struct getContext( ); // Does nothing; nop configurePattern( ); // for each function for (funcId = 0; funcId < BLAS_FUNCTIONS_NUMBER; funcId++) { char *pRest = NULL; BlasFunctionInfo *funcInfo = getBlasFunctionInfo( &genInfo.targetDevice, funcId ); if (funcInfo->envImplementation != NULL) { const char *envImpl; envImpl = getenv(funcInfo->envImplementation); if (envImpl != NULL) { envPattId = strtoul( envImpl, &pRest, 10 ); //wrong value of env. variable AMD_CLBLAS_X_IMPLEMENTATION if( 0 == strlen( envImpl ) || pRest != envImpl + strlen(envImpl) ){ isEnvPattSelected = false; } else{ isEnvPattSelected = true; } } else{ isEnvPattSelected = false; } } // if pattern is selected by environment - tune it // otherwise - start from the pattern number 0 if( true == isEnvPattSelected ){ pattId = envPattId; } else{ pattId = 0; } //logPattern( funcInfo->name ); do { unsigned int nExtra; unsigned int nTuneExtra = 0; BlasPatternInfo * bPatt; MemoryPattern* pattern; bPatt = &(funcInfo->pattInfo[pattId]); pattern = &(funcInfo->pattern[pattId]); //if select a new trsm memory pattern (#3), then skip it if ( funcId == CLBLAS_TRSM && pattId == 3) { pattId++; continue; } for (nExtra = 0; nExtra < bPatt->numExtra; ++nExtra) { bool isRebuildRequired; BlasExtraInfo* bExtra; bExtra = &(bPatt->extra[nExtra]); genInfo.last = 0; // This evaluates whether the current combination of parameters from the given function should be tuned or not // If skipFlags returns 1, then the this combination is skipped // This checks for hardcoded combinations which are skipped because of known runtime bugs. 
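/*
 * Editor's illustration (hypothetical flag values, not taken from the
 * original source): the isFlag()/isNoFlag() helpers defined earlier are
 * plain bit-mask tests, e.g. with info->flags == 0x5:
 *     isFlag(info, 0x4)   -> (0x5 & 0x4) == 0x4          -> true
 *     isFlag(info, 0x3)   -> (0x5 & 0x3) == 0x1, not 0x3 -> false
 *     isNoFlag(info, 0x2) -> (0x5 & 0x2) == 0x0          -> true
 * The skipFlags()/isFilter() calls below use this kind of check, together
 * with the command-line filters, to decide whether a (function, pattern,
 * data type, flags) combination is tuned at all.
 */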
if ( skipFlags(bExtra, pattId, funcId, &genInfo.deviceInfos, !isEnvPattSelected ) ) { continue; } // Similar logic to skipFlags, but this mostly filters out cases that were specified on the command line if (isFilter(bExtra, pattId, funcId)) { continue; } logPattern( funcInfo->name ); calcExtraCount(nTuneExtra, bPatt->numTuneExtra); nTuneExtra++; logDimension(funcInfo); logExtra(bExtra); isRebuildRequired = isRebuild(bExtra); if (isRebuildRequired) { size_t bestPatamSize = sizeof(GParam*)*DIMARRAYCOUNT; GParam* bestParam[DIMARRAYCOUNT]; memset(bestParam, 0, bestPatamSize); initParamsTime(bExtra); findBestParams( pattern, funcId, pattId, isEnvPattSelected, bExtra, bestParam); saveBestParams(bExtra, bestParam); generateKernelForOthersFlag( bExtra, bestParam, pattern); deleteGParams(bExtra, bestParam); } checkDatas(bExtra, pattern); } /* extra */ //logEndPattern(funcId, pattId); pattId++; /* patt */ }while( false == isEnvPattSelected && pattId < clblasSolvers[funcId].nrPatterns ); } /* func */ } /* dev */ destroyGenInfo(); } void parseArg(int argc, char* argv[]) { static char* help= "clblasTune - automatically tune the clblas " "library for specific hardware.\n" "\n" "clblas function related parameters:\n" " --gemm\n" " Tune kernels for the GEMM function family.\n" " --trmm\n" " Tune kernels for the TRMM function family.\n" " --trsm\n" " Tune kernels for the TRSM function family.\n" " --gemv\n" " Tune kernels for the GEMV function family.\n" " --symv\n" " Tune kernels for the SYMV function family.\n" " --syrk\n" " Tune kernels for the SYRK function family.\n" " --syr2k\n" " Tune kernels for the SYR2K function family.\n" "\n" " You can specify the parameters of " "several alternatives simultaneously.\n" "\n" " If any of these parameters is not specified the " "tool tries kernels for all the functions.\n" "\n" " Used data types:\n" " --float\n" " Single precision version of functions.\n" " --double\n" " Double precision version of functions.\n" " --complex\n" " Single complex precision version of functions.\n" " --double-complex\n" " Double complex precision version of functions.\n" "\n" " You can specify the parameters of " "several alternatives simultaneously.\n" "\n" " If any of these parameters is not specified the " "tool tries kernels for all the data types.\n" "\n" "Management:\n" " --fast\n" " Using this option allows you to accelerate " "tuning in up to 2-3 times. Achieving an optimal result " "is not guaranteed.\n" " --rebuild\n" " Re-tuning the fastest OpenCL kernels. Can be " "used after the driver update.\n" " --store-kernels\n" " Store found best kernels into a database file\n" " WARNING! 
The file can be very large.\n" "\n" ; static char* args[] = { "--gemm", // 0 "--trmm", // 1 "--trsm", // 2 "--buffers", // 3 "--images", // 4 "--float", // 5 "--double", // 6 "--complex", // 7 "--double-complex", // 8 "--store-kernels", // 9 "--rebuild", // 10 #if defined(_EXTENDED_TUNE_ARG) "--e", // 11 "--max", // 12 "--extended-output", // 13 #else "", "", "", #endif "--gemv", // 14 "--symv", // 15 "--syrk", // 20 "--syr2k", // 17 "--fast", // 18 "--caches", // 19 "--help" // 20 }; int i; unsigned int j; bool isSetFunction = false; bool isSetType = false; genInfo.aAll = true; for (i = 1; i < argc; ++i) { char * arg = argv[i]; bool b = true; for (j = 0; j < sizeof(args)/sizeof(char*); ++ j){ if (strcmp(arg, args[j]) == 0){ #if defined(_EXTENDED_TUNE_ARG) int argi = 0; #endif switch (j){ case 0 : genInfo.aFunc[CLBLAS_GEMM] = true; isSetFunction = true; break; case 1 : genInfo.aFunc[CLBLAS_TRMM] = true; isSetFunction = true; break; case 2 : genInfo.aFunc[CLBLAS_TRSM] = true; isSetFunction = true; break; case 3 : genInfo.aPattern = 0; break; case 4 : genInfo.aPattern = 1; break; case 5 : genInfo.aDType[TYPE_FLOAT] = true; isSetType = true; break; case 6 : genInfo.aDType[TYPE_DOUBLE] = true; isSetType = true; break; case 7 : genInfo.aDType[TYPE_COMPLEX_FLOAT] = true; isSetType = true; break; case 8 : genInfo.aDType[TYPE_COMPLEX_DOUBLE] = true; isSetType = true; break; case 9 : genInfo.aIsKernel = true; break; case 10: genInfo.aCommand = C_REBUILD; break; #if defined(_EXTENDED_TUNE_ARG) case 11: i++; argi = atoi(argv[i]); genInfo.aFlag = argi; break; case 12: i++; argi = atoi(argv[i]); genInfo.aMaxparam = argi; break; case 13: genInfo.aExtendedOutput = true; break; #endif case 14: genInfo.aFunc[CLBLAS_GEMV] = true; isSetFunction = true; break; case 15: genInfo.aFunc[CLBLAS_SYMV] = true; isSetFunction = true; break; case 16: genInfo.aFunc[CLBLAS_SYRK] = true; isSetFunction = true; break; case 17: genInfo.aFunc[CLBLAS_SYR2K] = true; isSetFunction = true; break; case 18: genInfo.aAll = false; break; case 19: genInfo.aPattern = 2; break; case 20: printf ("%s", help); exit(0); break; } b = false; } } if (b) { fprintf(stdout, "Unknown argument %s\n", arg); } } if (!isSetFunction) { for (i=0; i < BLAS_FUNCTIONS_NUMBER; ++i) { genInfo.aFunc[i] = 1; } } if (!isSetType) { for (i=0; i < TYPE_NUMBER; ++i) { genInfo.aDType[i] = 1; } } } int main(int argc, char* argv[]) { FILE_PATH = getenv(ENV_FILE_PATH); // This clears and initializes the global GeneratorInfoRec genInfo struct initGeneratorInfoRec( ); parseArg(argc, argv); // This will // Set up the global clblasSolvers for all function families supported within blas, including initializing memory patterns // Identify all recognized devices in the system clblasSetup(); if (!FILE_PATH){ printf("The environment variable 'CLBLAS_STORAGE_PATH' is not defined\n"); exit(EXIT_COD_NO_ENVIRONMENT_VARIABLE); } logStream = stdout; createFile(); #ifdef TEST_LOG int h = (int)(globalTime/1000/60/60); int m = (int)(globalTime/1000/60) - h*60; int c = (int)(globalTime/1000) - m*60 - h*60*60; fprintf(logStream, " --> All time : %2d:%2d:%2d \n",h, m,c); h = (int)(globalFastTime/1000/60/60); m = (int)(globalFastTime/1000/60) - h*60; c = (int)(globalFastTime/1000) - m*60 - h*60*60; fprintf(logStream, " --> Fast time : %2d:%2d:%2d \n",h, m,c); #endif } char* getDeviceName(cl_device_id devID, int * status) { char* devName; size_t size; *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, 0, NULL, &size); checkErrorFunc("clGetDeviceInfo", *status); devName = malloc(size * 
sizeof(char)); *status = clGetDeviceInfo(devID, CL_DEVICE_NAME, size, devName, NULL); checkErrorFunc("clGetDeviceInfo", *status); return devName; } clblas-2.10/src/library/tools/tune/tune.h000066400000000000000000000023551264277366700204170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TOOLS_H__ #define TOOLS_H__ #include #include #include #include #include #include #include "storage_data.h" extern const char *FileID; extern const char *FileExt; extern const char *ENV_FILE_PATH; struct SubDimInfo; void initMask(unsigned int* mask); char* getDevName(TargetDevice* devId); void initCLDeviceInfoRec(TargetDevice* devID, DeviceInfo *devInfo); #endif /* TOOLS_H__ */ clblas-2.10/src/samples/000077500000000000000000000000001264277366700151535ustar00rootroot00000000000000clblas-2.10/src/samples/CMakeLists.pack000066400000000000000000000237571264277366700200300ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## cmake_minimum_required(VERSION 2.6) project(clblas.samples) # Configure set(AMDAPPSDKROOT $ENV{AMDAPPSDKROOT} CACHE FILEPATH "ATI Stream SDK root path") if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
FORCE) endif() if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64) set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) else() if(CMAKE_SIZEOF_VOID_P MATCHES 8) set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) else() set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE) endif() endif() message(STATUS "Target platform: ${TARGET_PLATFORM}-bit") if(TARGET_PLATFORM EQUAL 32) set(_arch "x86" INTERNAL) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS FALSE) else() set(_arch "x86_64" INTERNAL) set_property(GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS TRUE) endif() # Find OpenCL SDK find_path(OPENCL_INCLUDE_DIRS CL/cl.h OpenCL/cl.h HINTS $ENV{AMDAPPSDKROOT}/include ${AMDAPPSDKROOT}/include ) find_library(OPENCL_LIBRARIES OpenCL HINTS $ENV{AMDAPPSDKROOT}/lib/${_arch} ${AMDAPPSDKROOT}/lib/${_arch} ) if(OPENCL_INCLUDE_DIRS AND OPENCL_LIBRARIES) message(STATUS "Found OpenCL: ${OPENCL_LIBRARIES}") else() message(FATAL_ERROR "Cannot find OpenCL SDK") endif() mark_as_advanced(OPENCL_INCLUDE_DIRS OPENCL_LIBRARIES) # Turn on maximum compiler verbosity if(CMAKE_COMPILER_IS_GNUCXX) add_definitions(-pedantic -Wall -Wextra -D_POSIX_C_SOURCE=199309L -D_XOPEN_SOURCE=500 ) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -Wstrict-prototypes" CACHE STRING "Default CFLAGS" FORCE) # Don't use -rpath. set(CMAKE_SKIP_RPATH ON CACHE BOOL "Skip RPATH" FORCE) set(CMAKE_C_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_C_FLAGS}") set(CMAKE_CXX_FLAGS "-m${TARGET_PLATFORM} ${CMAKE_CXX_FLAGS}") if(TARGET_PLATFORM EQUAL 32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin") endif() endif() if (WIN32) add_definitions(-D_CRT_SECURE_NO_WARNINGS) endif() set(SGEMV_SAMPLE_SRC example_sgemv.c) set(SSYMV_SAMPLE_SRC example_ssymv.c) set(SGEMM_SAMPLE_SRC example_sgemm.c) set(STRMM_SAMPLE_SRC example_strmm.c) set(STRSM_SAMPLE_SRC example_strsm.c) set(SSYRK_SAMPLE_SRC example_ssyrk.c) set(SSYR2K_SAMPLE_SRC example_ssyr2k.c) set(STRMV_SAMPLE_SRC example_strmv.c) set(STRSV_SAMPLE_SRC example_strsv.c) set(SGER_SAMPLE_SRC example_sger.c) set(SSYR_SAMPLE_SRC example_ssyr.c) set(SSYR2_SAMPLE_SRC example_ssyr2.c) set(SSYMM_SAMPLE_SRC example_ssymm.c) set(CHER_SAMPLE_SRC example_cher.c) set(CHEMM_SAMPLE_SRC example_chemm.cpp) set(CHERK_SAMPLE_SRC example_cherk.cpp) set(STPMV_SAMPLE_SRC example_stpmv.c) set(CHPMV_SAMPLE_SRC example_chpmv.c) set(STPSV_SAMPLE_SRC example_stpsv.c) set(SSPMV_SAMPLE_SRC example_sspmv.c) set(SSPR_SAMPLE_SRC example_sspr.c) set(CHPR_SAMPLE_SRC example_chpr.c) set(SSPR2_SAMPLE_SRC example_sspr2.c) set(ZHPR2_SAMPLE_SRC example_zhpr2.c) set(SGBMV_SAMPLE_SRC example_sgbmv.c) set(STBMV_SAMPLE_SRC example_stbmv.c) set(SSBMV_SAMPLE_SRC example_ssbmv.c) set(CHBMV_SAMPLE_SRC example_chbmv.c) set(STBSV_SAMPLE_SRC example_stbsv.c) set(CHER2K_SAMPLE_SRC example_cher2k.c) set(SSWAP_SAMPLE_SRC example_sswap.c) set(SSCAL_SAMPLE_SRC example_sscal.c) set(CSSCAL_SAMPLE_SRC example_csscal.c) set(SCOPY_SAMPLE_SRC example_scopy.c) set(SAXPY_SAMPLE_SRC example_saxpy.c) set(SDOT_SAMPLE_SRC example_sdot.c) set(SROTG_SAMPLE_SRC example_srotg.c) set(SROTMG_SAMPLE_SRC example_srotmg.c) set(SROT_SAMPLE_SRC example_srot.c) set(SROTM_SAMPLE_SRC example_srotm.c) set(iSAMAX_SAMPLE_SRC example_isamax.c) set(SNRM2_SAMPLE_SRC example_snrm2.c) set(SASUM_SAMPLE_SRC example_sasum.c) include_directories( ${OPENCL_INCLUDE_DIRS} ${PROJECT_SOURCE_DIR}/../include ) # Set the OpenCL library include path depending on target platform if( 
TARGET_PLATFORM EQUAL 64 ) if( WIN32 ) link_directories( ${ATI_STREAM_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64/import ) elseif( UNIX ) link_directories( ${ATI_STREAM_SDK_ROOT}/lib/x86_64/ ${PROJECT_SOURCE_DIR}/../lib64 ) endif() else() if( WIN32 ) link_directories( ${ATI_STREAM_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32/import ) elseif( UNIX ) link_directories( ${ATI_STREAM_SDK_ROOT}/lib/x86/ ${PROJECT_SOURCE_DIR}/../lib32 ) endif() endif() add_executable(example_sgemv ${SGEMV_SAMPLE_SRC}) target_link_libraries(example_sgemv ${OPENCL_LIBRARIES} clblas) add_executable(example_ssymv ${SSYMV_SAMPLE_SRC}) target_link_libraries(example_ssymv ${OPENCL_LIBRARIES} clblas) add_executable(example_sgemm ${SGEMM_SAMPLE_SRC}) target_link_libraries(example_sgemm ${OPENCL_LIBRARIES} clblas) add_executable(example_strmm ${STRMM_SAMPLE_SRC}) target_link_libraries(example_strmm ${OPENCL_LIBRARIES} clblas) add_executable(example_strsm ${STRSM_SAMPLE_SRC}) target_link_libraries(example_strsm ${OPENCL_LIBRARIES} clblas) add_executable(example_ssyrk ${SSYRK_SAMPLE_SRC}) target_link_libraries(example_ssyrk ${OPENCL_LIBRARIES} clblas) add_executable(example_ssyr2k ${SSYR2K_SAMPLE_SRC}) target_link_libraries(example_ssyr2k ${OPENCL_LIBRARIES} clblas) add_executable(example_strmv ${STRMV_SAMPLE_SRC}) target_link_libraries(example_strmv ${OPENCL_LIBRARIES} clblas) add_executable(example_strsv ${STRSV_SAMPLE_SRC}) target_link_libraries(example_strsv ${OPENCL_LIBRARIES} clblas) add_executable(example_sger ${SGER_SAMPLE_SRC}) target_link_libraries(example_sger ${OPENCL_LIBRARIES} clblas) add_executable(example_ssyr ${SSYR_SAMPLE_SRC}) target_link_libraries(example_ssyr ${OPENCL_LIBRARIES} clblas) add_executable(example_ssyr2 ${SSYR2_SAMPLE_SRC}) target_link_libraries(example_ssyr2 ${OPENCL_LIBRARIES} clblas) add_executable(example_ssymm ${SSYMM_SAMPLE_SRC}) target_link_libraries(example_ssymm ${OPENCL_LIBRARIES} clblas) add_executable(example_cher ${CHER_SAMPLE_SRC}) target_link_libraries(example_cher ${OPENCL_LIBRARIES} clblas) add_executable(example_chemm ${CHEMM_SAMPLE_SRC}) target_link_libraries(example_chemm ${OPENCL_LIBRARIES} clblas) add_executable(example_cherk ${CHERK_SAMPLE_SRC}) target_link_libraries(example_cherk ${OPENCL_LIBRARIES} clblas) add_executable(example_stpmv ${STPMV_SAMPLE_SRC}) target_link_libraries(example_stpmv ${OPENCL_LIBRARIES} clblas) add_executable(example_chpmv ${CHPMV_SAMPLE_SRC}) target_link_libraries(example_chpmv ${OPENCL_LIBRARIES} clblas) add_executable(example_stpsv ${STPSV_SAMPLE_SRC}) target_link_libraries(example_stpsv ${OPENCL_LIBRARIES} clblas) add_executable(example_sspmv ${SSPMV_SAMPLE_SRC}) target_link_libraries(example_sspmv ${OPENCL_LIBRARIES} clblas) add_executable(example_sspr ${SSPR_SAMPLE_SRC}) target_link_libraries(example_sspr ${OPENCL_LIBRARIES} clblas) add_executable(example_chpr ${CHPR_SAMPLE_SRC}) target_link_libraries(example_chpr ${OPENCL_LIBRARIES} clblas) add_executable(example_sspr2 ${SSPR2_SAMPLE_SRC}) target_link_libraries(example_sspr2 ${OPENCL_LIBRARIES} clblas) add_executable(example_zhpr2 ${ZHPR2_SAMPLE_SRC}) target_link_libraries(example_zhpr2 ${OPENCL_LIBRARIES} clblas) add_executable(example_sgbmv ${SGBMV_SAMPLE_SRC}) target_link_libraries(example_sgbmv ${OPENCL_LIBRARIES} clblas) add_executable(example_stbmv ${STBMV_SAMPLE_SRC}) target_link_libraries(example_stbmv ${OPENCL_LIBRARIES} clblas) add_executable(example_ssbmv ${SSBMV_SAMPLE_SRC}) target_link_libraries(example_ssbmv ${OPENCL_LIBRARIES} clblas) 
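# Editor's note (illustration only): every sample is wired up with the same
# two-line pattern -- declare the executable from its single source file and
# link it against OpenCL and clblas.  A hypothetical additional sample
# (example_foo.c is not part of this tree) would follow the same recipe:
#
#   set(SFOO_SAMPLE_SRC example_foo.c)
#   add_executable(example_foo ${SFOO_SAMPLE_SRC})
#   target_link_libraries(example_foo ${OPENCL_LIBRARIES} clblas)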
add_executable(example_chbmv ${CHBMV_SAMPLE_SRC}) target_link_libraries(example_chbmv ${OPENCL_LIBRARIES} clblas) add_executable(example_stbsv ${STBSV_SAMPLE_SRC}) target_link_libraries(example_stbsv ${OPENCL_LIBRARIES} clblas) add_executable(example_cher2k ${CHER2K_SAMPLE_SRC}) target_link_libraries(example_cher2k ${OPENCL_LIBRARIES} clblas) add_executable(example_sswap ${SSWAP_SAMPLE_SRC}) target_link_libraries(example_sswap ${OPENCL_LIBRARIES} clblas) add_executable(example_sscal ${SSCAL_SAMPLE_SRC}) target_link_libraries(example_sscal ${OPENCL_LIBRARIES} clblas) add_executable(example_csscal ${CSSCAL_SAMPLE_SRC}) target_link_libraries(example_csscal ${OPENCL_LIBRARIES} clblas) add_executable(example_scopy ${SCOPY_SAMPLE_SRC}) target_link_libraries(example_scopy ${OPENCL_LIBRARIES} clblas) add_executable(example_saxpy ${SAXPY_SAMPLE_SRC}) target_link_libraries(example_saxpy ${OPENCL_LIBRARIES} clblas) add_executable(example_sdot ${SDOT_SAMPLE_SRC}) target_link_libraries(example_sdot ${OPENCL_LIBRARIES} clblas) add_executable(example_srotg ${SROTG_SAMPLE_SRC}) target_link_libraries(example_srotg ${OPENCL_LIBRARIES} clblas) add_executable(example_srotmg ${SROTMG_SAMPLE_SRC}) target_link_libraries(example_srotmg ${OPENCL_LIBRARIES} clblas) add_executable(example_srot ${SROT_SAMPLE_SRC}) target_link_libraries(example_srot ${OPENCL_LIBRARIES} clblas) add_executable(example_srotm ${SROTM_SAMPLE_SRC}) target_link_libraries(example_srotm ${OPENCL_LIBRARIES} clblas) add_executable(example_isamax ${iSAMAX_SAMPLE_SRC}) target_link_libraries(example_isamax ${OPENCL_LIBRARIES} clblas) add_executable(example_snrm2 ${SNRM2_SAMPLE_SRC}) target_link_libraries(example_snrm2 ${OPENCL_LIBRARIES} clblas) add_executable(example_sasum ${SASUM_SAMPLE_SRC}) target_link_libraries(example_sasum ${OPENCL_LIBRARIES} clblas) clblas-2.10/src/samples/CMakeLists.txt000066400000000000000000000327561264277366700177300ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## set(SGEMV_SAMPLE_SRC example_sgemv.c) set(SSYMV_SAMPLE_SRC example_ssymv.c) set(SGEMM_SAMPLE_SRC example_sgemm.c) set(STRMM_SAMPLE_SRC example_strmm.c) set(STRSM_SAMPLE_SRC example_strsm.c) set(SSYRK_SAMPLE_SRC example_ssyrk.c) set(SSYR2K_SAMPLE_SRC example_ssyr2k.c) set(STRMV_SAMPLE_SRC example_strmv.c) # Addition: for STRMV set(DTRMV_SAMPLE_SRC example_dtrmv.c) # Addition: for STRMV set(STRSV_SAMPLE_SRC example_strsv.c) # Addition: for STRSV set(SGER_SAMPLE_SRC example_sger.c) # Addition: for SGER set(SSYR_SAMPLE_SRC example_ssyr.c) # Addition: for SSYR set(SSYR2_SAMPLE_SRC example_ssyr2.c) # Addition: for SSYR2 set(CHER_SAMPLE_SRC example_cher.c) set(ZHEMV_SAMPLE_SRC example_zhemv.cpp) set(ZHER2_SAMPLE_SRC example_zher2.c) set(CHERK_SAMPLE_SRC example_cherk.cpp) set(SSYMM_SAMPLE_SRC example_ssymm.c) set(CHEMM_SAMPLE_SRC example_chemm.cpp) set(STPMV_SAMPLE_SRC example_stpmv.c) set(CHPMV_SAMPLE_SRC example_chpmv.c) set(STPSV_SAMPLE_SRC example_stpsv.c) set(SSPMV_SAMPLE_SRC example_sspmv.c) set(SSPR_SAMPLE_SRC example_sspr.c) set(CHPR_SAMPLE_SRC example_chpr.c) set(SSPR2_SAMPLE_SRC example_sspr2.c) set(ZHPR2_SAMPLE_SRC example_zhpr2.c) set(SGBMV_SAMPLE_SRC example_sgbmv.c) set(STBMV_SAMPLE_SRC example_stbmv.c) set(SSBMV_SAMPLE_SRC example_ssbmv.c) set(CHBMV_SAMPLE_SRC example_chbmv.c) set(STBSV_SAMPLE_SRC example_stbsv.c) set(CHER2K_SAMPLE_SRC example_cher2k.c) set(SSWAP_SAMPLE_SRC example_sswap.c) set(SSCAL_SAMPLE_SRC example_sscal.c) set(CSSCAL_SAMPLE_SRC example_csscal.c) set(SCOPY_SAMPLE_SRC example_scopy.c) set(SAXPY_SAMPLE_SRC example_saxpy.c) set(SDOT_SAMPLE_SRC example_sdot.c) set(SROTG_SAMPLE_SRC example_srotg.c) set(SROTMG_SAMPLE_SRC example_srotmg.c) set(SROT_SAMPLE_SRC example_srot.c) set(SROTM_SAMPLE_SRC example_srotm.c) set(iSAMAX_SAMPLE_SRC example_isamax.c) set(SNRM2_SAMPLE_SRC example_snrm2.c) set(SASUM_SAMPLE_SRC example_sasum.c) set(VERSION_SAMPLE_SRC clBlasVersion.c) include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR}) add_executable(example_sgemv ${SGEMV_SAMPLE_SRC}) target_link_libraries(example_sgemv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sgemv PROPERTY FOLDER "Samples") add_executable(example_ssymv ${SSYMV_SAMPLE_SRC}) target_link_libraries(example_ssymv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssymv PROPERTY FOLDER "Samples") add_executable(example_sgemm ${SGEMM_SAMPLE_SRC}) target_link_libraries(example_sgemm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sgemm PROPERTY FOLDER "Samples") add_executable(example_strmm ${STRMM_SAMPLE_SRC}) target_link_libraries(example_strmm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_strmm PROPERTY FOLDER "Samples") add_executable(example_strsm ${STRSM_SAMPLE_SRC}) target_link_libraries(example_strsm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_strsm PROPERTY FOLDER "Samples") add_executable(example_ssyrk ${SSYRK_SAMPLE_SRC}) target_link_libraries(example_ssyrk ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssyrk PROPERTY FOLDER "Samples") add_executable(example_ssyr2k ${SSYR2K_SAMPLE_SRC}) target_link_libraries(example_ssyr2k ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssyr2k PROPERTY FOLDER "Samples") add_executable(version ${VERSION_SAMPLE_SRC}) target_link_libraries(version ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET version PROPERTY FOLDER "Samples") # Addition - for samples add_executable(example_strmv ${STRMV_SAMPLE_SRC}) 
target_link_libraries(example_strmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_strmv PROPERTY FOLDER "Samples") add_executable(example_dtrmv ${DTRMV_SAMPLE_SRC}) target_link_libraries(example_dtrmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_dtrmv PROPERTY FOLDER "Samples") add_executable(example_strsv ${STRSV_SAMPLE_SRC}) target_link_libraries(example_strsv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_strsv PROPERTY FOLDER "Samples") add_executable(example_sger ${SGER_SAMPLE_SRC}) target_link_libraries(example_sger ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sger PROPERTY FOLDER "Samples") add_executable(example_cher ${CHER_SAMPLE_SRC}) target_link_libraries(example_cher ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_cher PROPERTY FOLDER "Samples") add_executable(example_ssyr ${SSYR_SAMPLE_SRC}) target_link_libraries(example_ssyr ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssyr PROPERTY FOLDER "Samples") add_executable(example_ssyr2 ${SSYR2_SAMPLE_SRC}) target_link_libraries(example_ssyr2 ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssyr2 PROPERTY FOLDER "Samples") add_executable(example_zhemv ${ZHEMV_SAMPLE_SRC}) target_link_libraries(example_zhemv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_zhemv PROPERTY FOLDER "Samples") add_executable(example_zher2 ${ZHER2_SAMPLE_SRC}) target_link_libraries(example_zher2 ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_zher2 PROPERTY FOLDER "Samples") add_executable(example_cherk ${CHERK_SAMPLE_SRC}) target_link_libraries(example_cherk ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_cherk PROPERTY FOLDER "Samples") add_executable(example_ssymm ${SSYMM_SAMPLE_SRC}) target_link_libraries(example_ssymm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssymm PROPERTY FOLDER "Samples") add_executable(example_chemm ${CHEMM_SAMPLE_SRC}) target_link_libraries(example_chemm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_chemm PROPERTY FOLDER "Samples") add_executable(example_stpmv ${STPMV_SAMPLE_SRC}) target_link_libraries(example_stpmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_stpmv PROPERTY FOLDER "Samples") add_executable(example_chpmv ${CHPMV_SAMPLE_SRC}) target_link_libraries(example_chpmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_chpmv PROPERTY FOLDER "Samples") add_executable(example_stpsv ${STPSV_SAMPLE_SRC}) target_link_libraries(example_stpsv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_stpsv PROPERTY FOLDER "Samples") add_executable(example_sspmv ${SSPMV_SAMPLE_SRC}) target_link_libraries(example_sspmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sspmv PROPERTY FOLDER "Samples") add_executable(example_sspr ${SSPR_SAMPLE_SRC}) target_link_libraries(example_sspr ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sspr PROPERTY FOLDER "Samples") add_executable(example_chpr ${CHPR_SAMPLE_SRC}) target_link_libraries(example_chpr ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_chpr PROPERTY FOLDER "Samples") add_executable(example_sspr2 ${SSPR2_SAMPLE_SRC}) target_link_libraries(example_sspr2 ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sspr2 PROPERTY FOLDER "Samples") add_executable(example_zhpr2 ${ZHPR2_SAMPLE_SRC}) target_link_libraries(example_zhpr2 ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_zhpr2 PROPERTY FOLDER "Samples") add_executable(example_sgbmv ${SGBMV_SAMPLE_SRC}) target_link_libraries(example_sgbmv 
${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sgbmv PROPERTY FOLDER "Samples") add_executable(example_stbmv ${STBMV_SAMPLE_SRC}) target_link_libraries(example_stbmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_stbmv PROPERTY FOLDER "Samples") add_executable(example_ssbmv ${SSBMV_SAMPLE_SRC}) target_link_libraries(example_ssbmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_ssbmv PROPERTY FOLDER "Samples") add_executable(example_chbmv ${CHBMV_SAMPLE_SRC}) target_link_libraries(example_chbmv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_chbmv PROPERTY FOLDER "Samples") add_executable(example_stbsv ${STBSV_SAMPLE_SRC}) target_link_libraries(example_stbsv ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_stbsv PROPERTY FOLDER "Samples") add_executable(example_cher2k ${CHER2K_SAMPLE_SRC}) target_link_libraries(example_cher2k ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_cher2k PROPERTY FOLDER "Samples") add_executable(example_sswap ${SSWAP_SAMPLE_SRC}) target_link_libraries(example_sswap ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sswap PROPERTY FOLDER "Samples") add_executable(example_sscal ${SSCAL_SAMPLE_SRC}) target_link_libraries(example_sscal ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sscal PROPERTY FOLDER "Samples") add_executable(example_csscal ${CSSCAL_SAMPLE_SRC}) target_link_libraries(example_csscal ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_csscal PROPERTY FOLDER "Samples") add_executable(example_scopy ${SCOPY_SAMPLE_SRC}) target_link_libraries(example_scopy ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_scopy PROPERTY FOLDER "Samples") add_executable(example_saxpy ${SAXPY_SAMPLE_SRC}) target_link_libraries(example_saxpy ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_saxpy PROPERTY FOLDER "Samples") add_executable(example_sdot ${SDOT_SAMPLE_SRC}) target_link_libraries(example_sdot ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sdot PROPERTY FOLDER "Samples") add_executable(example_srotg ${SROTG_SAMPLE_SRC}) target_link_libraries(example_srotg ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_srotg PROPERTY FOLDER "Samples") add_executable(example_srotmg ${SROTMG_SAMPLE_SRC}) target_link_libraries(example_srotmg ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_srotmg PROPERTY FOLDER "Samples") add_executable(example_srot ${SROT_SAMPLE_SRC}) target_link_libraries(example_srot ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_srot PROPERTY FOLDER "Samples") add_executable(example_srotm ${SROTM_SAMPLE_SRC}) target_link_libraries(example_srotm ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_srotm PROPERTY FOLDER "Samples") add_executable(example_isamax ${iSAMAX_SAMPLE_SRC}) target_link_libraries(example_isamax ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_isamax PROPERTY FOLDER "Samples") add_executable(example_snrm2 ${SNRM2_SAMPLE_SRC}) target_link_libraries(example_snrm2 ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_snrm2 PROPERTY FOLDER "Samples") add_executable(example_sasum ${SASUM_SAMPLE_SRC}) target_link_libraries(example_sasum ${OPENCL_LIBRARIES} clBLAS) set_property( TARGET example_sasum PROPERTY FOLDER "Samples") # CPack configuration; include the executable into the package if( WIN32 ) set( CLBLAS_EXAMPLE_INSTALL_DESTINATION bin${SUFFIX_BIN}) else( ) set( CLBLAS_EXAMPLE_INSTALL_DESTINATION share/clBLAS/samples) endif() install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk 
example_ssyr2k example_strmm example_strsm example_strmv example_strsv example_sger example_cher example_ssyr example_ssyr2 example_cherk example_ssymm example_chemm example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr example_sspr2 example_zhpr2 example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv example_cher2k example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot example_srotg example_srotmg example_srot example_srotm example_snrm2 example_sasum example_isamax version RUNTIME DESTINATION ${CLBLAS_EXAMPLE_INSTALL_DESTINATION} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) configure_file( "${PROJECT_SOURCE_DIR}/samples/CMakeLists.pack" "${PROJECT_BINARY_DIR}/samples/CMakeLists.txt" COPYONLY ) if( WIN32 ) set( CLBLAS_SAMPLE_INSTALL_DESTINATION samples) else( ) set( CLBLAS_SAMPLE_INSTALL_DESTINATION share/clBLAS/samples/src) endif() install(FILES example_sgemv.c example_ssymv.c example_sgemm.c example_strmm.c example_strsm.c example_ssyrk.c example_ssyr2k.c example_strmv.c example_strsv.c example_sger.c example_ssyr.c example_ssyr2.c example_ssymm.c example_cher.c example_chemm.cpp example_cherk.cpp example_ssymm.c example_chemm.cpp example_stpmv.c example_chpmv.c example_stpsv.c example_sspmv.c example_sspr.c example_chpr.c example_sspr2.c example_zhpr2.c example_sgbmv.c example_stbmv.c example_ssbmv.c example_chbmv.c example_stbsv.c example_cher2k.c example_sswap.c example_sscal.c example_scopy.c example_csscal.c example_saxpy.c example_sdot.c example_srotg.c example_srotmg.c example_srot.c example_srotm.c example_isamax.c example_snrm2.c example_sasum.c clBlasVersion.c ${PROJECT_BINARY_DIR}/samples/CMakeLists.txt DESTINATION ${CLBLAS_SAMPLE_INSTALL_DESTINATION} ) clblas-2.10/src/samples/clBlasVersion.c000066400000000000000000000024061264277366700200670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include int main(void) { cl_uint major,minor,patch; clblasStatus err; err = clblasGetVersion(&major,&minor,&patch); if (err != CL_SUCCESS) { printf("clblasGetVersion() failed with %d\n", err); return 1; } printf("clblas version %d.%d.%d\n", major,minor,patch); return 0; } clblas-2.10/src/samples/example_chbmv.c000066400000000000000000000123751264277366700201410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const size_t K = 2; static const cl_float2 alpha = {{10,10}}; static const cl_float2 A[] = { {{ 4.0, 4.0}}, {{ 7.0, 7.0}}, {{11.0, 11.0}}, {{ 5.0, 5.0}}, {{ 8.0, 8.0}}, {{12.0, 12.0}}, {{ 6.0, 6.0}}, {{ 9.0, 9.0}}, {{13.0, 13.0}}, {{10.0, 10.0}}, {{14.0, 14.0}}, {{00.0, 00.0}}, {{15.0, 15.0}}, {{00.0, 00.0}}, {{00.0, 00.0}} }; static const size_t lda = 3; // lda = K + 1 static const cl_float2 X[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incx = 1; static const cl_float2 beta = {{20.0, 20.0}}; static cl_float2 Y[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incy = 1; static void printResult(void) { size_t i; printf("Result:\n"); for (i = 0; i < N; i++) { printf("(%9.2f, %-9.2f)\n", CREAL(Y[i * incy]), CIMAG(Y[i * incy])); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float2), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float2), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2), Y, 0, NULL, NULL); /* Call clblas function. 
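 * CHBMV computes y := alpha*A*x + beta*y, where A is an N x N Hermitian band
 * matrix with K super-diagonals stored in banded layout (hence lda = K + 1
 * above).  The trailing arguments follow the usual clBLAS convention: the
 * number of command queues and the queue list, then the event wait list
 * (empty here) and the event used to wait for completion.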
*/ err = clblasChbmv(order, uplo, N, K, alpha, bufA, 0 /*offA */, lda, bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offx*/, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasChbmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); printResult(); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2), Y, 0, NULL, NULL); /* At this point you will get the result of CHBMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_chemm.cpp000066400000000000000000000126631264277366700204730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; #define M 4 #define N 3 static const cl_float2 alpha = {{10, 10}}; static const clblasSide side = clblasLeft; static const clblasUplo uplo = clblasLower; static const cl_float2 A[M*M] = { {{11, 12}}, {{-1, -1}}, {{-1, -1}}, {{-1, -1}}, {{21, 22}}, {{22, 23}}, {{-1, -1}}, {{-1, -1}}, {{31, 32}}, {{32, 33}}, {{33, 34}}, {{-1, -1}}, {{41, 61}}, {{42, 62}}, {{43, 73}}, {{44, 23}} }; static const size_t lda = M; static const cl_float2 B[M*N] = { {{11, -21}}, {{-12, 23}}, {{13, 33}}, {{21, 12}}, {{22, -10}}, {{23, 5}}, {{31, 1}}, {{-32, 65}}, {{33, -1}}, {{1, 41}}, {{-33, 42}}, {{12, 43}}, }; static const size_t ldb = N; static const cl_float2 beta = {{20, 20}}; static cl_float2 C[M*N] = { {{11, 11}}, {{-12, 12}}, {{13, 33}}, {{21, -32}}, {{22, -1}}, {{23, 0}}, {{31, 13}}, {{-32, 78}}, {{33, 45}}, {{41, 14}}, {{0, 42}}, {{43, -1}}, }; static const size_t ldc = N; static void printResult(void) { size_t i, j, nrows; printf("Result:\n"); nrows = (sizeof(C) / sizeof(cl_float2)) / ldc; for (i = 0; i < nrows; i++) { for (j = 0; j < ldc; j++) { printf("<%9.2f, %-9.2f> ", CREAL(C[i * ldc + j]), CIMAG(C[i*ldc + j])); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * M * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas function. */ err = clblasChemm(order, side, uplo, M, N, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsymm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SYMM placed in C array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_cher.c000066400000000000000000000116371264277366700177630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
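 * CHER performs the Hermitian rank-1 update A := alpha*x*x^H + A, where
 * alpha is a real scalar; only the triangle selected by 'uplo' (the upper
 * triangle in this row-major example) is referenced and updated, which is
 * why the lower triangle of A below is just zero padding.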
*/ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10; static const clblasUplo uplo = clblasUpper; static cl_float2 A[] = { {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}}, {{00.0f, 00.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{55.0f, 00.0f}} }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float2 X[] = { {{11.0f, 23.0f}}, {{21.0f, 65.0f}}, {{31.0f, 20.0f}}, {{41.0f, 02.0f}}, {{51.0f, 10.0f}} }; static const int incx = 1; static void printResult(void) { size_t i, j; printf("\nResult:\n"); for (i = 0; i < N; i++) { for(j = 0; j < N; j++) printf("(%9.2lf, %-9.2lf)\t", CREAL( A[ i*N + j ] ), CIMAG( A[ i*N + j ] )); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float2), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float2), X, 0, NULL, NULL); err = clblasCher(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCher() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float2)), A, 0, NULL, NULL); /* At this point you will get the result of CHER placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_cher2k.c000066400000000000000000000134151264277366700202140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasColumnMajor; static const clblasUplo uplo = clblasLower; static const clblasTranspose transA = clblasNoTrans; static const size_t N = 5; static const size_t K = 4; static const cl_float2 alpha = {{10, 1}}; static const cl_float2 A[] = { {{11, 0}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{21, 0}}, {{22, 0}}, {{23, 0}}, {{24, 0}}, {{31, 0}}, {{32, 0}}, {{33, 0}}, {{34, 0}}, {{41, 0}}, {{42, 0}}, {{43, 0}}, {{44, 0}}, {{51, 0}}, {{52, 0}}, {{53, 0}}, {{54, 0}} }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float2 B[] = { {{1, 0}}, {{2, 0}}, {{3, 0}}, {{4, 0}}, {{2, 0}}, {{2, 0}}, {{3, 0}}, {{4, 0}}, {{3, 0}}, {{2, 0}}, {{3, 0}}, {{3, 0}}, {{4, 0}}, {{4, 0}}, {{4, 0}}, {{4, 0}}, {{5, 0}}, {{5, 0}}, {{5, 0}}, {{5, 0}} }; static const size_t ldb = 5; /* i.e. lda = N */ static const cl_float beta = 1; static cl_float2 C[] = { {{11, 1}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{15, 0}}, {{ 0, 0}}, {{22, 2}}, {{23, 0}}, {{24, 0}}, {{25, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{33, 4}}, {{34, 0}}, {{35, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{44, 5}}, {{45, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{55, 6}} }; static const size_t ldc = 5; /* i.e. ldc = N */ static void printResult(void) { size_t i, j; printf("Result:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("(%9.2f, %-9.2f) ", CREAL(C[i + j * ldc]), CIMAG(C[i + j * ldc])); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC, bufB; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); if ((bufA == NULL) || (bufC == NULL) || (bufB == NULL)) { printf("Failed to create buffern"); return 1; } err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, N * K * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas function. */ err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCher2k() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in C array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_cherk.cpp000066400000000000000000000127171264277366700204760ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
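 * CHERK computes C := alpha*A*A^H + beta*C (with transA == clblasNoTrans),
 * where C is an N x N Hermitian matrix, A is N x K, and alpha and beta are
 * real scalars.  Only the triangle selected by 'uplo' (the lower triangle in
 * this column-major example) is referenced and updated.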
*/ static const clblasOrder order = clblasColumnMajor; static const clblasUplo uplo = clblasLower; static const clblasTranspose transA = clblasNoTrans; static const size_t N = 5; static const size_t K = 4; static const cl_float alpha = 10; static const cl_float2 A[] = { {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}}, {{1, 0}} /* {{11, 0}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{21, 0}}, {{22, 0}}, {{23, 0}}, {{24, 0}}, {{31, 0}}, {{32, 0}}, {{33, 0}}, {{34, 0}}, {{41, 0}}, {{42, 0}}, {{43, 0}}, {{44, 0}}, {{51, 0}}, {{52, 0}}, {{53, 0}}, {{54, 0}} */ }; static const size_t lda = 5; /* i.e. lda = K */ static const cl_float beta = 1; static cl_float2 C[] = { {{11, 1}}, {{12, 0}}, {{13, 0}}, {{14, 0}}, {{15, 0}}, {{ 0, 0}}, {{22, 2}}, {{23, 0}}, {{24, 0}}, {{25, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{33, 4}}, {{34, 0}}, {{35, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{44, 5}}, {{45, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{ 0, 0}}, {{55, 6}} }; static const size_t ldc = 5; /* i.e. ldc = N */ static void printResult(void) { size_t i, j; printf("Result:\n"); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { printf("(%9.2f, %-9.2f) ", CREAL(C[i + j * ldc]), CIMAG(C[i + j * ldc])); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); if ((bufA == NULL) || (bufC == NULL)) { printf("Failed to create buffern"); return 1; } err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); // printResult(); /* Call clblas function. */ err = clblasCherk(order, uplo, transA, N, K, alpha, bufA, 0, lda, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCherk() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in C array. */ printResult(); } /* Release OpenCL events. 
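Note: the event returned by the clblas call belongs to the caller once the computation has been waited on, so it is released here to avoid leaking the OpenCL event object.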
*/ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_chpmv.c000066400000000000000000000127611264277366700201560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const cl_float2 A[] = { {{ 1.0, 00.0}}, {{ 2.0, 02.0}}, {{ 4.0, 4.0}}, {{ 7.0, 7.0}}, {{11.0, 11.0}}, {{ 3.0, 03.0}}, {{ 5.0, 5.0}}, {{ 8.0, 8.0}}, {{12.0, 12.0}}, {{ 6.0, 6.0}}, {{ 9.0, 9.0}}, {{13.0, 13.0}}, {{10.0, 10.0}}, {{14.0, 14.0}}, {{15.0, 15.0}} }; static const cl_float2 alpha = {{10,10}}; static const cl_float2 X[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incx = 1; static const cl_float2 beta = {{2.0, 2.0}}; static cl_float2 Y[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incy = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("(%9.2lf, %-9.2lf)\n", CREAL(Y[i * incy]), CIMAG(Y[i * incy])); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. 
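Note: clblasSetup() initializes the library's internal state and must be called once before any other clblas routine; it is paired with the clblasTeardown() call at the end of the program. The clblasChpmv call that follows computes y := alpha*A*x + beta*y, with the Hermitian matrix A supplied in packed form (N*(N+1)/2 elements).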
*/ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float2), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float2), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasChpmv(order, uplo, N, alpha, bufAP, 0 /*offA */, bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offy*/, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasChpmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float2), Y, 0, NULL, NULL); /* At this point you will get the result of CHPMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_chpr.c000066400000000000000000000123641264277366700177740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
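Note: clblasChpr performs the Hermitian rank-1 update A := alpha*x*x^H + A on a packed Hermitian matrix: only the triangle selected by uplo is stored, so AP holds N*(N+1)/2 elements, and alpha is a real scalar. With the row-major, upper-triangle layout used below, the 0-based offset of element A(i,j) with i <= j works out to i*N - i*(i-1)/2 + (j - i).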
*/ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10; static const clblasUplo uplo = clblasUpper; static cl_float2 AP[] = { {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}}, {{55.0f, 00.0f}} }; static const cl_float2 X[] = { {{11.0f, 23.0f}}, {{21.0f, 65.0f}}, {{31.0f, 20.0f}}, {{41.0f, 02.0f}}, {{51.0f, 10.0f}} }; static const int incx = 1; static void printResult(void) { size_t i, j, off; printf("\nResult:\n"); off = 0; for (i = 0; i < N; i++) { for(j = 0; j < N; j++) { if( ( (uplo == clblasUpper) && (i > j)) || ((uplo == clblasLower) && (j > i)) ) { printf("\t\t\t"); continue; } printf("(%9.2lf, %-9.2lf)\t", CREAL( AP[ off ] ), CIMAG( AP[ off ] )); off ++ ; } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numElementsAP * sizeof(cl_float2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float2), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float2), X, 0, NULL, NULL); err = clblasChpr(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasChpr() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float2)), AP, 0, NULL, NULL); /* At this point you will get the result of CHPR placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
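Note: objects are released in the reverse order of creation, the command queue first and then the context that owns it.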
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_csscal.c000066400000000000000000000100721264277366700203020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static const cl_float alpha = 10; static cl_float2 X[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incx = 1; static void printResult(void) { size_t i; printf("\nResult:\n"); for (i = 0; i < N; i++) { printf("(%f, %-f) \n", CREAL(X[i]), CIMAG(X[i])); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place vectors inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, ( lenX * sizeof(cl_float2)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX * sizeof(cl_float2)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasCsscal( N, alpha, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCsscal() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX * sizeof(cl_float2)), X, 0, NULL, NULL); /* At this point you will get the result of CSSCAL placed in vector X. */ printResult(); } /* Release OpenCL events. 
*/ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_dtrmv.c000066400000000000000000000116701264277366700201730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasColumnMajor; static const size_t N = 5; static const cl_double alpha = 10; static const clblasUplo uplo = clblasUpper; static const cl_double A[] = { 11, 12, 13, 14, 15, 0, 22, 23, 24, 25, 0, 0, 33, 34, 35, 0, 0, 0, 44, 45, 0, 0, 0, 0, 55 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_double X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static const cl_double beta = 20; static cl_double Y[] = { 11, 21, 31, 41, 51 }; static const int incy = 1; static void printResult(void) { size_t i, nElements; printf("Result:\n"); nElements = (sizeof(Y) / sizeof(cl_double)) / incy; for (i = 0; i < nElements; i++) { printf("%d\n", (int)Y[i * incy]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. 
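Note: bufA holds the full N x lda column-major matrix and bufX holds N doubles (incx = 1). clblasDtrmv updates x in place (x := op(A)*x); the extra buffer passed after X (bufY here) acts as the scratch space the routine requires, and the updated vector is read back from bufX afterwards.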
*/ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasDtrmv(order, uplo, clblasTrans, clblasUnit, N, bufA, 0 /*offA */, lda, bufX, 0 /*offX */, incx, bufY, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasDtrmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* At this point you will get the result of SSYMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_isamax.c000066400000000000000000000103001264277366700203060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 7; static cl_float X[] = { 1, 2, -11, 17, 5, 6, 800, 10 }; static const int incx = 1; static cl_uint indexMax; int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, scratchBuf, iMax; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenScratchBuf = N; /* Setup OpenCL environment. 
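Note: the clblasiSamax call further down scans X for the element with the largest absolute value and writes that element's index into the iMax buffer. The routine also needs a user-supplied scratch buffer; as the code comments below explain, the sample allocates 2*N floats, which is comfortably above the minimum.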
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); // Allocate minimum of (N/64) elements. But here allocating N elements for the sake of simplicity scratchBuf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenScratchBuf*sizeof(cl_float) * 2), NULL, &err); // Buffer to return the index of max absolute value in X iMax = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof(cl_uint), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)) , X, 0, NULL, NULL); /* Call clblas function. */ err = clblasiSamax( N, iMax, 0, bufX, 0, incx, scratchBuf, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasiSamax() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, iMax, CL_TRUE, 0, sizeof(cl_uint), &indexMax, 0, NULL, NULL); printf("Result amax: %d\n", indexMax); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(scratchBuf); clReleaseMemObject(iMax); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sasum.c000066400000000000000000000100241264277366700201570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
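Note: clblasSasum computes the sum of the absolute values of the elements of X and writes the single float result into a device buffer (bufAsum below); like the other reduction routines it also needs a scratch buffer, allocated here with N elements.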
*/ static const size_t N = 7; static cl_float X[] = { 1, 2, -11, 17, 5, 6, 81, }; static const int incx = 1; static cl_float asum; int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufAsum, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSasum() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float), &asum, 0, NULL, NULL); printf("Result : %f\n", asum); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAsum); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_saxpy.c000066400000000000000000000104361264277366700202020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. 
It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 7; static const cl_float alpha = 10; static cl_float X[] = { 11, 21, 31, 41, 51, 61, 71, }; static const int incx = 1; static cl_float Y[] = { 15, 11, 1, 2, 1, 8, 1, }; static const int incy = 1; static void printResult(void) { size_t i; printf("\nResult:\n"); printf("Y\n"); for (i = 0; i < N; i++) { printf("\t%f\n", Y[i]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSaxpy( N, alpha, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSaxpy() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* At this point you will get the result of SAXPY placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_scopy.c000066400000000000000000000107221264277366700201710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 7; static cl_float X[] = { 11, 21, 31, 41, 51, 61, 71, }; static const int incx = 1; static cl_float Y[] = { 0, 2, 0, 0, 0, 5, 0, }; static const int incy = 1; static void printResult(void) { size_t i; printf("\nResult:\n"); printf(" X\n"); for (i = 0; i < N; i++) { printf("\t%f\n", X[i]); } printf("Y\n"); for (i = 0; i < N; i++) { printf("\t%f\n", Y[i]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasScopy( N, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasScopy() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* At this point you will get the result of SSWAP placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. 
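Note: the clblasScopy call above simply copied the elements of X into Y. clblasTeardown() releases the resources acquired by clblasSetup(); no clblas routine may be called after it without calling clblasSetup() again.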
*/ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sdot.c000066400000000000000000000107261264277366700200110ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 7; static cl_float X[] = { 1, 2, -11, 17, 5, 6, 81, }; static const int incx = 1; static cl_float Y[] = { 1, 5, 6, 4, 9, 10, 4, }; static const int incy = 1; static cl_float dotProduct; int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufDotP, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenY*sizeof(cl_float)), NULL, &err); // Allocate 1 element space for dotProduct bufDotP = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. 
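Note: clblasSdot computes the dot product of X and Y and stores the single float result in bufDotP at the given offset; scratchBuff (at least N floats) provides the temporary workspace the reduction needs.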
*/ err = clblasSdot( N, bufDotP, 0, bufX, 0, incx, bufY, 0, incy, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSdot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufDotP, CL_TRUE, 0, sizeof(cl_float), &dotProduct, 0, NULL, NULL); printf("Result dot product: %f\n", dotProduct); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufDotP); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sgbmv.c000066400000000000000000000120461264277366700201530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasTranspose trans = clblasNoTrans; static const size_t M = 5; static const size_t N = 5; static const size_t KL = 1; static const size_t KU = 2; static const size_t lda = 4; // lda = KL + KU + 1 static const cl_float A[] = { 00, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 00, 51, 62, 00, 00 }; static const cl_float alpha = 10; static const cl_float X[] = { 11, 21, 31, 41, 51, }; static const int incx = 1; static const cl_float beta = 20; static cl_float Y[] = { 11, 21, 31, 41, 51, }; static const int incy = 1; static void printResult(const char* str) { size_t i; printf("%s:\n", str); for (i = 0; i < M; i++) { printf("%f\n", Y[ i * incy ]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
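Note: the clblasSgbmv call later in this sample computes y := alpha*op(A)*x + beta*y for an M x N band matrix with KL sub-diagonals and KU super-diagonals; only the band is stored, which is why lda is KL + KU + 1 rather than the full matrix width.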
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(cl_float), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSgbmv(order, trans, M, N, KL, KU, alpha, bufA, 0, lda, bufX, 0, incx, beta, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgbmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(cl_float), Y, 0, NULL, NULL); /* At this point you will get the result of SGBMV placed in Y array. */ printResult("clblasSgbmv result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sgemm.c000066400000000000000000000132011264277366700201370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
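Note: clblasSgemm computes C := alpha*op(A)*op(B) + beta*C. Besides the plain matrix product, this sample exercises the offset arguments: offA, offB and offC (defined below) shift the start of each matrix by one row and one column, so the call updates only the lower-right (M-1) x (N-1) sub-matrix of C using the corresponding sub-matrices of A and B.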
*/ #define M 4 #define N 3 #define K 5 static const clblasOrder order = clblasRowMajor; static const cl_float alpha = 10; static const clblasTranspose transA = clblasNoTrans; static const cl_float A[M*K] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45, }; static const size_t lda = K; /* i.e. lda = K */ static const clblasTranspose transB = clblasNoTrans; static const cl_float B[K*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, 51, 52, 53, }; static const size_t ldb = N; /* i.e. ldb = N */ static const cl_float beta = 20; static cl_float C[M*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, }; static const size_t ldc = N; /* i.e. ldc = N */ static cl_float result[M*N]; static const size_t off = 1; static const size_t offA = K + 1; /* K + off */ static const size_t offB = N + 1; /* N + off */ static const size_t offC = N + 1; /* N + off */ static void printResult(const char* str) { size_t i, j, nrows; printf("%s:\n", str); nrows = (sizeof(result) / sizeof(cl_float)) / ldc; for (i = 0; i < nrows; i++) { for (j = 0; j < ldc; j++) { printf("%d ", (int)result[i * ldc + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm(order, transA, transB, M - off, N - off, K - off, alpha, bufA, offA, lda, bufB, offB, ldb, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgemmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SGEMM placed in 'result' array. 
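Note: only the (M-1) x (N-1) block starting at offset offC was touched by the kernel; the first row and first column of the buffer still hold the original values of C uploaded earlier.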
*/ puts(""); printResult("clblasSgemmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sgemv.c000066400000000000000000000123641264277366700201610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t M = 4; static const size_t N = 5; static const cl_float alpha = 10; static const clblasTranspose transA = clblasNoTrans; static const cl_float A[] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float X[] = { 11, 21, 31, 41, 51, }; static const int incx = 1; static const cl_float beta = 20; static cl_float Y[] = { 11, 21, 31, 41, }; static const int incy = 1; static cl_float result[4]; /* M */ static const size_t off = 1; static const size_t offA = 5 + 1; /* M + off */ static const size_t offX = 1; /* off */ static const size_t offY = 1; /* off */ static void printResult(const char* str) { size_t i, nElements; printf("%s:\n", str); nElements = (sizeof(result) / sizeof(cl_float)) / incy; for (i = 0; i < nElements; i++) { printf("%d\n", (int)result[i * incy]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. 
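Note: the clblasSgemv call below computes y := alpha*op(A)*x + beta*y; as in the GEMM sample, non-zero offsets (offA, offX, offY) make the routine operate on a sub-matrix of A and sub-vectors of X and Y.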
*/ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * N * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas extended function. */ err = clblasSgemv(order, transA, M - off, N - off, alpha, bufA, offA, lda, bufX, offX, incx, beta, bufY, offY, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgemvEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, M * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SGEMV placed in 'result' array. */ puts(""); printResult("clblasSgemvEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sger.c000066400000000000000000000117041264277366700177750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t M = 5; static const size_t N = 5; static const cl_float alpha = 10; static cl_float A[] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45, 81, 22, 33, 14, 75 }; static const size_t lda = 5; /* i.e. 
lda = N */ static const cl_float X[] = { 11, 21, 31, 41, 51, 91, }; static const int incx = 1; static const cl_float Y[] = { 45, 23, 39, 45, 50, 10, }; static const int incy = 1; static void printResult(void) { size_t i, j; printf("\nResult:\n"); for (i = 0; i < M; i++) { for(j = 0; j < N; j++) printf("\t%f", A[ i*N + j ]); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, ( 1 + ( M - 1 )*abs( incx ) ) * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, ( 1 + ( N - 1 )*abs( incy ) ) * sizeof(cl_float), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSger(order, M, N, alpha, bufX, 0, incx, bufY, 0, incy, bufA, 0, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSger() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (M * lda * sizeof(cl_float)), A, 0, NULL, NULL); /* At this point you will get the result of SGER placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_snrm2.c000066400000000000000000000101131264277366700200670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 7; static cl_float X[] = { 1, 2, -11, 17, 5, 6, 81, }; static const int incx = 1; static cl_float NRM2; int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufNRM2, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place vectors inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); // Allocate 1 element space for NRM2 bufNRM2 = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (2*N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasSnrm2(N, bufNRM2, 0, bufX, 0, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSnrm2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufNRM2, CL_TRUE, 0, sizeof(cl_float), &NRM2, 0, NULL, NULL); printf("Result Euclidean Norm: %f\n", NRM2); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufNRM2); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_srot.c000066400000000000000000000111331264277366700200210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const size_t N = 5; static cl_float X[] = { 1, 2, 3, 4, 5, }; static const int incx = 1; static cl_float Y[] = { 6, 7, 8, 9, 9, }; static const int incy = 1; static const cl_float C = 2.0; static const cl_float S = 3.0; static void printResult(void) { size_t i; printf("\nResult:\n"); printf("X\n"); for (i = 0; i < N; i++) { printf("\t%f\n", X[i]); } printf("Y\n"); for (i = 0; i < N; i++) { printf("\t%f\n", Y[i]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function.
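 * clblasSrot applies the plane rotation defined by the scalars C and S to the
 * vectors X and Y. For each element it is roughly equivalent to:
 *     temp = C * X[i] + S * Y[i];
 *     Y[i] = C * Y[i] - S * X[i];
 *     X[i] = temp;
 * (a sketch of the standard BLAS SROT update, using the C and S defined above).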
*/ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_srotg.c000066400000000000000000000112211264277366700201650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static cl_float SA = 11; static cl_float SB = 21; static cl_float C = 0.2; static cl_float S = 0.5; static void printResult(void) { printf("\nResult:\n"); printf("SA: %f\tSB: %f\t C: %f\tS: %f\n", SA, SB, C, S); } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufSA, bufSB, bufC, bufS; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. 
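 * SROTG operates on four scalars rather than vectors, so each buffer below
 * holds a single cl_float. Given the inputs SA and SB, clblasSrotg computes a
 * Givens rotation (C, S) and overwrites SA and SB, so that roughly:
 *     [  C  S ] [ SA ]   [ r ]
 *     [ -S  C ] [ SB ] = [ 0 ]
 * with r returned in place of SA.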
*/ bufSA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufSB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufS = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufSA, CL_TRUE, 0, sizeof(cl_float), &SA, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufSB, CL_TRUE, 0, sizeof(cl_float), &SB, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, sizeof(cl_float), &C, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufS, CL_TRUE, 0, sizeof(cl_float), &S, 0, NULL, NULL); /* Call clblas function. */ err = clblasSrotg(bufSA, 0, bufSB, 0, bufC, 0, bufS, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotg() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufSA, CL_TRUE, 0, sizeof(cl_float), &SA, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufSB, CL_TRUE, 0, sizeof(cl_float), &SB, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, sizeof(cl_float), &C, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufS, CL_TRUE, 0, sizeof(cl_float), &S, 0, NULL, NULL); /* At this point you will get the result of SROTG placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufSA); clReleaseMemObject(bufSB); clReleaseMemObject(bufC); clReleaseMemObject(bufS); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_srotm.c000066400000000000000000000114661264277366700202060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
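 * A note on SPARAM below: SPARAM[0] is the flag selecting the form of the
 * modified rotation matrix H (-2.0, -1.0, 0.0 or 1.0 in standard BLAS), and
 * SPARAM[1..4] hold its elements. With flag = -1 the full matrix is applied,
 * so clblasSrotm in effect computes, per element (sketch using the original
 * values of X[i] and Y[i]):
 *     newX = SPARAM[1] * X[i] + SPARAM[3] * Y[i];
 *     newY = SPARAM[2] * X[i] + SPARAM[4] * Y[i];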
*/ static const size_t N = 7; static cl_float X[] = { 11, 21, 31, 41, 51, 61, 71, }; static const int incx = 1; static cl_float Y[] = { 15, 11, 1, 2, 1, 8, 1, }; static const int incy = 1; static cl_float SPARAM[] = { -1, 10, 12, 20, 2 }; static void printResult(void) { size_t i; printf("\nResult:\n"); printf("X\n"); for (i = 0; i < N; i++) { printf("\t%f\n", X[i]); } printf("Y\n"); for (i = 0; i < N; i++) { printf("\t%f\n", Y[i]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufParam; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); int lenParam = 5; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* Call clblas function. */ err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROTM placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufParam); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_srotmg.c000066400000000000000000000124101264277366700203430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static cl_float SD1 = 10; static cl_float SD2 = 21; static cl_float SX1 = 1; static cl_float SY1 = -1; static cl_float SPARAM[] = { -1, 10, 12, 20, 2 }; static void printResult(void) { printf("\nResult:\n"); printf("SD1: %f,\tSD2: %f,\t SX1: %f,\tSY1: %f\nSPARAM: %f %f %f %f %f\n", SD1, SD2, SX1, SY1, SPARAM[0], SPARAM[1], SPARAM[2], SPARAM[3], SPARAM[4]); } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufD1, bufD2, bufX1, bufY1, bufParam; cl_event event = NULL; int ret = 0; int lenParam = 5; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufD1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufD2 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufX1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufY1 = clCreateBuffer(ctx, CL_MEM_READ_WRITE, sizeof(cl_float), NULL, &err); bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufD1, CL_TRUE, 0, sizeof(cl_float), &SD1, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufD2, CL_TRUE, 0, sizeof(cl_float), &SD2, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX1, CL_TRUE, 0, sizeof(cl_float), &SX1, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY1, CL_TRUE, 0, sizeof(cl_float), &SY1, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* Call clblas function. 
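 * clblasSrotmg constructs the modified Givens rotation that zeroes the second
 * component of the vector (sqrt(SD1)*SX1, sqrt(SD2)*SY1). On completion it
 * overwrites SD1, SD2 and SX1 with updated values and stores the rotation
 * description (flag plus matrix elements) in SPARAM; SY1 is an input only.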
*/ err = clblasSrotmg(bufD1, 0, bufD2, 0, bufX1, 0, bufY1, 0, bufParam, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotmg() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufD1, CL_TRUE, 0, sizeof(cl_float), &SD1, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufD2, CL_TRUE, 0, sizeof(cl_float), &SD2, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX1, CL_TRUE, 0, sizeof(cl_float), &SX1, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufY1, CL_TRUE, 0, sizeof(cl_float), &SY1, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* At this point you will get the result of SROTG placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufD1); clReleaseMemObject(bufD2); clReleaseMemObject(bufX1); clReleaseMemObject(bufY1); clReleaseMemObject(bufParam); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssbmv.c000066400000000000000000000116731264277366700201740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const size_t K = 2; static const cl_float alpha = 10; static const cl_float A[] = { 11, 12, 13, 22, 23, 24, 33, 34, 35, 44, 45, 00, 55, 00, 00 }; static const size_t lda = 3; // lda = K + 1 static const cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static const cl_float beta = 20; static cl_float Y[] = { 11, 21, 31, 41, 51 }; static const int incy = 1; static void printResult(const char* str) { size_t i; printf("%s:\n", str); for (i = 0; i < N; i++) { printf("%f\n", Y[ i * incy ]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
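 * (Reminder: the symmetric band matrix A above is stored in banded row-major
 * form. Each of the N rows holds the diagonal element followed by its K
 * super-diagonals, which is why lda = K + 1 and the last rows are padded
 * with zeros, as the data initializer suggests.)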
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSsbmv(order, uplo, N, K, alpha, bufA, 0, lda, bufX, 0, incx, beta, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsbmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); /* At this point you will get the result of SSBMV placed in Y array. */ printResult("clblasSsbmv result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sscal.c000066400000000000000000000077431264277366700201520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
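 * clblasSscal scales the vector in place: in effect X[i] = alpha * X[i] for
 * every element visited through incx, so with alpha = 10 below each entry of
 * X ends up multiplied by ten.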
*/ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static const cl_float alpha = 10; static cl_float X[] = { 11, 21, 31, 41, 51, }; static const int incx = 1; static void printResult(void) { size_t i; printf("\nResult:\n"); for (i = 0; i < N; i++) { printf("\t%f", X[i] ); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place vectors inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, ( lenX * sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, ( lenX * sizeof(cl_float)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasSscal( N, alpha, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSscal() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX * sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SSCAL placed in X vector. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sspmv.c000066400000000000000000000120221264277366700201770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. 
*/ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static const cl_float alpha = 10; static const clblasUplo uplo = clblasUpper; static const cl_float AP[] = { 11, 12, 13, 14, 15, 22, 23, 24, 25, 33, 34, 35, 44, 45, 55 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static const cl_float beta = 20; static cl_float Y[] = { 11, 21, 31, 41, 51 }; static const int incy = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%.3f\n", Y[i * incy]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSspmv(order, uplo, N, alpha, bufAP, 0, bufX, 0, incx, beta, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSspmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); /* At this point you will get the result of SSPMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sspr.c000066400000000000000000000115541264277366700200270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10.0; static const clblasUplo uplo = clblasUpper; static cl_float AP[] = { 1.0, 02.0, 03.0, 04.0, 05.0, 06.0, 07.0, 08.0, 09.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0 }; static const cl_float X[] = { 1.0, 2.0, 3.0, 4.0, 5.0 }; static const int incx = 1; static void printResult(void) { size_t i, j, off; printf("\nResult:\n"); off = 0; for (i = 0; i < N; i++) { for(j = 0; j < N; j++) { if( ( (uplo == clblasUpper) && (i > j)) || ((uplo == clblasLower) && (j > i)) ) { printf("\t\t"); continue; } printf("%8.2lf\t", AP[ off ]); off ++ ; } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. 
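 * AP is the packed form of a symmetric matrix: only the clblasUpper triangle
 * is stored, row by row, which is why numElementsAP = N * (N + 1) / 2 floats
 * are allocated. clblasSspr then performs the rank-1 update
 *     AP := alpha * X * X^T + AP
 * on that packed triangle.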
*/ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, numElementsAP * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clblasSspr(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSspr() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)), AP, 0, NULL, NULL); /* At this point you will get the result of SSPR placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sspr2.c000066400000000000000000000122661264277366700201120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10.0; static const clblasUplo uplo = clblasUpper; static cl_float AP[] = { 01.0, 02.0, 03.0, 04.0, 05.0, 06.0, 07.0, 08.0, 09.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0 }; static const cl_float X[] = { 1.0, 2.0, 3.0, 4.0, 5.0 }; static const int incx = 1; static const cl_float Y[] = { 5.0, 4.0, 3.0, 2.0, 1.0 }; static const int incy = 1; static void printResult(void) { size_t i, j, off; printf("\nResult:\n"); off = 0; for (i = 0; i < N; i++) { for(j = 0; j < N; j++) { if( ( (uplo == clblasUpper) && (i > j)) || ((uplo == clblasLower) && (j > i)) ) { printf("\t\t"); continue; } printf("%8.2lf\t", AP[ off ]); off ++ ; } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. 
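 * (For reference: clblasSspr2 below performs the packed symmetric rank-2
 * update AP := alpha * (X * Y^T + Y * X^T) + AP on the clblasUpper triangle
 * of AP, which again holds N * (N + 1) / 2 elements.)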
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_float)), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); err = clblasSspr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSspr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)), AP, 0, NULL, NULL); /* At this point you will get the result of SSPR2 placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_sswap.c000066400000000000000000000107321264277366700201720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
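 * clblasSswap simply exchanges the two vectors: after the call X holds the
 * old contents of Y and Y holds the old contents of X.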
*/ static const size_t N = 7; static cl_float X[] = { 11, 21, 31, 41, 51, 61, 71, }; static const int incx = 1; static cl_float Y[] = { 45, 23, 39, 45, 50, 55, 65, }; static const int incy = 1; static void printResult(void) { size_t i; printf("\nResult:\n"); printf(" X\n"); for (i = 0; i < N; i++) { printf("\t%f\n", X[i]); } printf("Y\n"); for (i = 0; i < N; i++) { printf("\t%f\n", Y[i]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place vectors inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasSswap( N, bufX, 0, incx, bufY, 0, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSswap() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* At this point you will get the result of SSWAP placed in vector X. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssymm.c000066400000000000000000000120401264277366700201770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; #define M 4 #define N 3 static const cl_float alpha = 10; static const clblasSide side = clblasLeft; static const clblasUplo uplo = clblasLower; static const cl_float A[M*M] = { 11, -1, -1, -1, 21, 22, -1, -1, 31, 32, 33, -1, 41, 42, 43, 44, }; static const size_t lda = M; static const cl_float B[M*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, }; static const size_t ldb = N; static const cl_float beta = 20; static cl_float C[M*N] = { 11, 12, 13, 21, 22, 23, 31, 32, 33, 41, 42, 43, }; static const size_t ldc = N; static void printResult(void) { size_t i, j, nrows; printf("Result:\n"); nrows = (sizeof(C) / sizeof(cl_float)) / ldc; for (i = 0; i < nrows; i++) { for (j = 0; j < ldc; j++) { printf("%d ", (int)C[i * ldc + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * M * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas function. */ err = clblasSsymm(order, side, uplo, M, N, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsymm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SYMM placed in C array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssymv.c000066400000000000000000000123301264277366700202120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static const cl_float alpha = 10; static const clblasUplo uplo = clblasUpper; static const cl_float A[] = { 11, 12, 13, 14, 15, 0, 22, 23, 24, 25, 0, 0, 33, 34, 35, 0, 0, 0, 44, 45, 0, 0, 0, 0, 55 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static const cl_float beta = 20; static cl_float Y[] = { 11, 21, 31, 41, 51 }; static const int incy = 1; static cl_float result[5]; /* N */ static const size_t off = 1; static const size_t offa = 5 + 1; /* N + off */ static const size_t offx = 1; /* off */ static const size_t offy = 1; /* off */ static void printResult(const char* str) { size_t i, nElements; printf("%s:\n", str); nElements = (sizeof(result) / sizeof(cl_float)) / incy; for (i = 0; i < nElements; i++) { printf("%d\n", (int)result[i * incy]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * N * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * N * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas extended function. */ err = clblasSsymv(order, uplo, N - off, alpha, bufA, offa, lda, bufX, offx, incx, beta, bufY, offy, incy, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsymvEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SSYMV placed in 'result' array. */ puts(""); printResult("clblasSsymvEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssyr.c000066400000000000000000000110271264277366700200330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
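 * clblasSsyr performs the symmetric rank-1 update A := alpha * X * X^T + A,
 * referencing and updating only the clblasUpper triangle of A; the zeros in
 * the lower triangle of the initializer below are never touched.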
*/ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10.0; static const clblasUplo uplo = clblasUpper; static cl_float A[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 0.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0, 10.0, 11.0, 12.0, 0.0, 0.0, 0.0, 13.0, 14.0, 0.0, 0.0, 0.0, 00.0, 15.0 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float X[] = { 1.0, 2.0, 3.0, 4.0, 5.0 }; static const int incx = 1; static void printResult(void) { size_t i, j; printf("\nResult:\n"); for (i = 0; i < N; i++) { for(j = 0; j < N; j++) printf("\t(%.2f)", A[ i*N + j ]); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } printResult(); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clblasSsyr(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsyr() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float)), A, 0, NULL, NULL); /* At this point you will get the result of SSYR placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssyr2.c000066400000000000000000000115261264277366700201210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_float alpha = 10.0; static const clblasUplo uplo = clblasUpper; static cl_float A[] = { 1.0, 2.0, 3.0, 4.0, 5.0, 0.0, 6.0, 7.0, 8.0, 9.0, 0.0, 0.0, 10.0, 11.0, 12.0, 0.0, 0.0, 0.0, 13.0, 14.0, 0.0, 0.0, 0.0, 00.0, 15.0 }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_float X[] = { 1.0, 2.0, 3.0, 4.0, 5.0 }; static const cl_float Y[] = { 5.0, 4.0, 3.0, 2.0, 1.0 }; static const int incx = 1, incy = 1; static void printResult(void) { size_t i, j; printf("\nResult:\n"); for (i = 0; i < N; i++) { for(j = 0; j < N; j++) printf("\t(%.2f)", A[ i*N + j ]); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } printResult(); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); err = clblasSsyr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsyr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. 
*/ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float)), A, 0, NULL, NULL); /* At this point you will get the result of SSYR2 placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssyr2k.c000066400000000000000000000132661264277366700202770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const clblasTranspose transAB = clblasNoTrans; static const size_t N = 5; static const size_t K = 4; static const cl_float alpha = 10; static const cl_float A[] = { 11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44, 51, 52, 53, 54 }; static const size_t lda = 4; /* i.e. lda = K */ static cl_float B[] = { 11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44, 51, 52, 53, 54 }; static const size_t ldb = 4; /* i.e. ldb = K */ static const cl_float beta = 20; static cl_float C[] = { 11, 12, 13, 14, 15, 12, 22, 23, 24, 25, 13, 23, 33, 34, 35, 14, 24, 34, 44, 45, 15, 25, 35, 45, 55 }; static const size_t ldc = 5; /* i.e. ldc = N */ static cl_float result[5*5]; /* ldc * N */ const size_t off = 1; static const size_t offA = 4 + 1; /* K + off */ static const size_t offB = 4 + 1; /* K + off */ static const size_t offC = 5 + 1; /* N + off */ static void printResult(const char* str) { size_t i, j, nrows; printf("%s:\n", str); nrows = (sizeof(result) / sizeof(cl_float)) / ldc; for (i = 0; i < nrows; i++) { for (j = 0; j < ldc; j++) { printf("%d ", (int)result[i * ldc + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
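 * The device selection below simply asks for the first GPU on the first
 * platform and gives up otherwise.  A slightly more forgiving variant could
 * fall back to whatever device the platform offers; the helper name
 * pick_device is hypothetical and shown only as a sketch:
 *
 *   static cl_int pick_device(cl_platform_id platform, cl_device_id *device)
 *   {
 *       cl_int err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, device, NULL);
 *       if (err == CL_DEVICE_NOT_FOUND) {
 *           // No GPU available; accept the platform's default device instead.
 *           err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, device, NULL);
 *       }
 *       return err;
 *   }
 *
 * Note that clBLAS kernels are tuned for GPUs, so performance on a fallback
 * device may be poor even when the call succeeds.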
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, N * K * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. Perform SYR2K for the lower right sub-matrices */ err = clblasSsyr2k(order, uplo, transAB, N - off, K - off, alpha, bufA, offA, lda, bufB, offB, ldb, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsyr2kEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SSYR2K placed in 'result' array. */ puts(""); printResult("clblasSsyr2kEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_ssyrk.c000066400000000000000000000122531264277366700202100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
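 * The hard-coded offsets offA = K + 1 and offC = N + 1 defined below select
 * element (1, 1) of A and of C, so the clblasSsyrk() call operates on the
 * lower-right (N-1) x (K-1) and (N-1) x (N-1) sub-matrices.  In row-major
 * storage, element (row, col) of a matrix with leading dimension ld lives at
 * index row * ld + col, so a more general way to derive the offsets would be
 * the following (illustrative statements only, not part of the sample):
 *
 *   size_t offA = off * lda + off;   // 1 * 4 + 1 == K + 1 for this sample
 *   size_t offC = off * ldc + off;   // 1 * 5 + 1 == N + 1 for this sample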
*/ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const clblasTranspose transA = clblasNoTrans; static const size_t N = 5; static const size_t K = 4; static const cl_float alpha = 10; static const cl_float A[] = { 11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44, 51, 52, 53, 54 }; static const size_t lda = 4; /* i.e. lda = K */ static const cl_float beta = 20; static cl_float C[] = { 11, 12, 13, 14, 15, 12, 22, 23, 24, 25, 13, 23, 33, 34, 35, 14, 24, 34, 44, 45, 15, 25, 35, 45, 55 }; static const size_t ldc = 5; /* i.e. ldc = N */ static cl_float result[5*5]; /* ldc*N */ static const size_t off = 1; static const size_t offA = 4 + 1; /* K + off */ static const size_t offC = 5 + 1; /* N + off */ static void printResult(const char* str) { size_t i, j, nrows; printf("%s:\n", str); nrows = (sizeof(result) / sizeof(cl_float)) / ldc; for (i = 0; i < nrows; i++) { for (j = 0; j < ldc; j++) { printf("%d ", (int)result[i * ldc + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. Perform SYRK for the lower right sub-matrices */ err = clblasSsyrk(order, uplo, transA, N - off, K - off, alpha, bufA, offA, lda, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSsyrkEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in 'result' array. */ puts(""); printResult("clblasSsyrkEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_stbmv.c000066400000000000000000000111631264277366700201670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const size_t K = 2; static const cl_float A[] = { 11, 12, 13, 22, 23, 24, 33, 34, 35, 44, 45, 00, 55, 00, 00 }; static const size_t lda = 3; // lda = K + 1 static cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%.3f\n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, scratchBuff; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. 
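 * The sample creates empty buffers and then copies the host arrays in with
 * blocking clEnqueueWriteBuffer() calls.  An equivalent approach, shown here
 * only as a sketch, is to hand the host pointer to clCreateBuffer() via
 * CL_MEM_COPY_HOST_PTR so that creation and initialization happen in one call:
 *
 *   bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
 *                         N * lda * sizeof(cl_float), (void *)A, &err);
 *
 * Either form works; the explicit write used below keeps buffer creation and
 * data transfer visible as separate steps.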
*/ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, N * sizeof(cl_float), NULL, &err); scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clblasStbmv(order, uplo, clblasNoTrans, clblasNonUnit, N, K, bufA, 0 /*offA */, lda, bufX, 0 /*offX */, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStbmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); /* At this point you will get the result of STBMV placed in X array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(scratchBuff); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_stbsv.c000066400000000000000000000110461264277366700201750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasTranspose trans = clblasTrans; static const clblasUplo uplo = clblasLower; static const clblasDiag diag = clblasNonUnit; static const size_t N = 5; static const size_t K = 2; static const cl_float A[] = { 11, 12, 13, 22, 23, 24, 33, 34, 35, 44, 45, 00, 55, 00, 00 }; static const size_t lda = 3; // lda = K + 1 static cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%f \n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStbsv(order, uplo, trans, diag, N, K, bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStbsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STBSV placed in X array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_stpmv.c000066400000000000000000000111331264277366700202020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
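 * The AP array below stores the upper triangle in packed row-major order:
 * row 0 contributes N elements, row 1 contributes N - 1, and so on, for a
 * total of N * (N + 1) / 2 values (the numElementsAP computed in main()).
 * A sketch of how a full row-major upper-triangular matrix could be packed
 * into this layout; pack_upper_row_major is a hypothetical helper name:
 *
 *   #include <stddef.h>
 *
 *   static void pack_upper_row_major(size_t n, const float *a, size_t lda,
 *                                    float *ap)
 *   {
 *       size_t i, j, k = 0;
 *       for (i = 0; i < n; i++) {
 *           for (j = i; j < n; j++) {
 *               ap[k++] = a[i * lda + j];   // row i contributes n - i values
 *           }
 *       }
 *       // On exit k equals n * (n + 1) / 2.
 *   }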
*/ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const cl_float AP[] = { 11, 12, 13, 14, 15, 22, 23, 24, 25, 33, 34, 35, 44, 45, 55 }; static cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%.3f\n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, scratchBuff; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, N * sizeof(cl_float), NULL, &err); scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clblasStpmv(order, uplo, clblasTrans, clblasUnit, N, bufAP, 0 /*offA */, bufX, 0 /*offX */, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStpmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STRMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(scratchBuff); clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_stpsv.c000066400000000000000000000113021264277366700202060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static const clblasTranspose transA = clblasTrans; static const clblasUplo uploA = clblasUpper; static const clblasDiag diagA = clblasUnit; static const cl_float A[] = { 11, 12, 13, 14, 15, 22, 23, 24, 25, 33, 34, 35, 44, 45, 55 }; static const size_t lda = 0; /* i.e. lda = N */ static cl_float X[] = { 11.0, 153.0, 657.0, 1753.0, 3671.0 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%f \n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, numElementsAP * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStpsv(order, uploA, transA, diagA, N, bufA, 0, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStpsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STPSV placed in X array. */ printResult(); } /* Release OpenCL events. 
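 * Note that if the clblasStpsv() call above failed before any work was
 * enqueued, 'event' may still be NULL, and clReleaseEvent(NULL) would itself
 * report CL_INVALID_EVENT (the samples ignore that return value, so it is
 * harmless here).  A slightly more defensive release step, as a sketch:
 *
 *   if (event != NULL) {
 *       clReleaseEvent(event);
 *   }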
*/ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_strmm.c000066400000000000000000000123251264277366700201770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasSide side = clblasLeft; static const size_t M = 4; static const size_t N = 5; static const cl_float alpha = 10; static const clblasTranspose transA = clblasNoTrans; static const clblasUplo uploA = clblasUpper; static const clblasDiag diagA = clblasNonUnit; static const cl_float A[] = { 11, 12, 13, 14, 0, 22, 23, 24, 0, 0, 33, 34, 0, 0, 0, 44 }; static const size_t lda = 4; /* i.e. lda = M */ static cl_float B[] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45 }; static const size_t ldb = 5; /* i.e. ldb = N */ static cl_float result[20]; /* ldb * M */ static const size_t off = 1; static const size_t offA = 4 + 1; /* K + off */ static const size_t offB = 5 + 1; /* N + off */ static void printResult(const char* str) { size_t i, j, nrows; printf("%s:\n", str); nrows = (sizeof(result) / sizeof(cl_float)) / ldb; for (i = 0; i < nrows; i++) { for (j = 0; j < ldb; j++) { printf("%d ", (int)result[i * ldb + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. 
*/ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*B), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * M * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*B), B, 0, NULL, NULL); /* Call clblas extended function. Perform TRMM for the lower right sub-matrices */ err = clblasStrmm(order, side, uploA, transA, diagA, M - off, N - off, alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStrmmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of STRMM placed in 'result' array. */ puts(""); printResult("clblasStrmmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_strmv.c000066400000000000000000000111131264277366700202020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const clblasUplo uplo = clblasUpper; static const size_t N = 5; static const cl_float A[] = { 11, 12, 13, 14, 15, 0, 22, 23, 24, 25, 0, 0, 33, 34, 35, 0, 0, 0, 44, 45, 0, 0, 0, 0, 55 }; static const size_t lda = 5; /* i.e. lda = N */ static cl_float X[] = { 11, 21, 31, 41, 51 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%.3f\n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, scratchBuff; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, N * sizeof(*X), NULL, &err); scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clblasStrmv(order, uplo, clblasTrans, clblasUnit, N, bufA, 0 /*offA */, lda, bufX, 0 /*offX */, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStrmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); /* At this point you will get the result of STRMV placed in X array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(scratchBuff); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_strsm.c000066400000000000000000000122641264277366700202070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. 
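 * For the parameter combination used here (side = left, uplo = upper,
 * transA = no-transpose, diagA = non-unit), clblasStrsm() solves
 * A * X = alpha * B and overwrites B with X.  A minimal host-side reference
 * based on column-by-column back substitution might look like the sketch
 * below; strsm_ref is a hypothetical name and the code assumes row-major
 * storage with no sub-matrix offsets:
 *
 *   #include <stddef.h>
 *
 *   static void strsm_ref(size_t m, size_t n, float alpha,
 *                         const float *a, size_t lda, float *b, size_t ldb)
 *   {
 *       size_t col, ii, k;
 *       for (col = 0; col < n; col++) {
 *           for (ii = 0; ii < m; ii++) {
 *               size_t row = m - 1 - ii;            // bottom row first
 *               float sum = alpha * b[row * ldb + col];
 *               for (k = row + 1; k < m; k++) {
 *                   sum -= a[row * lda + k] * b[k * ldb + col];
 *               }
 *               b[row * ldb + col] = sum / a[row * lda + row];
 *           }
 *       }
 *   }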
*/ static const clblasOrder order = clblasRowMajor; static const clblasSide side = clblasLeft; static const size_t M = 4; static const size_t N = 5; static const cl_float alpha = 10; static const clblasTranspose transA = clblasNoTrans; static const clblasUplo uploA = clblasUpper; static const clblasDiag diagA = clblasNonUnit; static const cl_float A[] = { 11, 12, 13, 14, 0, 22, 23, 24, 0, 0, 33, 34, 0, 0, 0, 44 }; static const size_t lda = 4; /* i.e. lda = M */ static cl_float B[] = { 11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 31, 32, 33, 34, 35, 41, 42, 43, 44, 45 }; static const size_t ldb = 5; /* i.e. ldb = N */ static cl_float result[20]; /* ldb*M */ static const size_t off = 1; static const size_t offA = 4 + 1; /* M + off */ static const size_t offB = 5 + 1; /* N + off */ static void printResult(const char* str) { size_t i, j, nrows; printf("%s:\n", str); nrows = (sizeof(result) / sizeof(cl_float)) / ldb; for (i = 0; i < nrows; i++) { for (j = 0; j < ldb; j++) { printf("%.5e ", result[i * ldb + j]); } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * M * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*B), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * M * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*B), B, 0, NULL, NULL); /* Call clblas function. Perform TRSM for the lower right sub-matrices */ err = clblasStrsm(order, side, uploA, transA, diagA, M - off, N - off, alpha, bufA, offA, lda, bufB, offB, ldb, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStrsmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufB, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of STRSM placed in 'result' array. */ puts(""); printResult("clblasStrsmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. 
*/ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_strsv.c000066400000000000000000000107611264277366700202200ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 4; static const clblasTranspose transA = clblasTrans; static const clblasUplo uploA = clblasLower; static const clblasDiag diagA = clblasNonUnit; static const cl_float A[] = { 11, 0, 0, 0, 12, 22, 0, 0, 13, 23, 33, 0, 14, 24, 34, 44 }; static const size_t lda = 4; /* i.e. lda = N */ static cl_float X[] = { 11, 21, 31, 41 }; static const int incx = 1; static void printResult(void) { size_t i; printf("Result:\n\n"); for (i = 0; i < N; i++) { printf("%f \n", X[i * incx]); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * N * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * N * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStrsv(order, uploA, transA, diagA, N, bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStrsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
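 * If the command queue had been created with the CL_QUEUE_PROFILING_ENABLE
 * property (this sample passes 0 as the properties argument), the same event
 * could also be used to time the operation.  A sketch, which reports only the
 * command associated with the event clBLAS returned:
 *
 *   cl_ulong t_start = 0, t_end = 0;
 *   clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
 *                           sizeof(t_start), &t_start, NULL);
 *   clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
 *                           sizeof(t_end), &t_end, NULL);
 *   printf("clblasStrsv took %llu ns\n",
 *          (unsigned long long)(t_end - t_start));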
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STRSV placed in X array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_zhemv.cpp000066400000000000000000000133541264277366700205310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasColumnMajor; static const size_t N = 5; static const cl_double2 alpha = {{10,10}}; static const clblasUplo uplo = clblasUpper; static const cl_double2 A[] = { {{ 1.0, 00.0}}, {{ 2.0, 02.0}}, {{ 4.0, 4.0}}, {{ 7.0, 7.0}}, {{11.0, 11.0}}, {{00.0, 00.0}}, {{ 3.0, 03.0}}, {{ 5.0, 5.0}}, {{ 8.0, 8.0}}, {{12.0, 12.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{ 6.0, 6.0}}, {{ 9.0, 9.0}}, {{13.0, 13.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{10.0, 10.0}}, {{14.0, 14.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{00.0, 00.0}}, {{15.0, 15.0}} }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_double2 X[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incx = 1; static const cl_double2 beta = {{20.0, 20.0}}; static cl_double2 Y[] = { {{1.0, 0.0}}, {{2.0, 0.0}}, {{3.0, 0.0}}, {{4.0, 0.0}}, {{5.0, 0.0}} }; static const int incy = 1; static void printResult(void) { size_t i, nElements; printf("Result:\n"); nElements = (sizeof(Y) / sizeof(cl_double2)) / incy; for (i = 0; i < nElements; i++) { printf("(%9.2f, %-9.2f)\n", CREAL(Y[i * incy]), CIMAG(Y[i * incy])); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. 
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas function. */ err = clblasZhemv(order, uplo, N, alpha, bufA, 0 /*offA */, lda, bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offx*/, incy, 1, &queue, 0, NULL, &event); // blasZhemv(order, uplo, N, alpha, (DoubleComplex*)A, 0, lda, (DoubleComplex*)X, 0, incx, beta, (DoubleComplex*)Y, 0, incy); // err = CL_SUCCESS; //err = clblasZtrmv(order, uplo, clblasNoTrans, clblasNonUnit, N, bufA, 0 /*offA */, lda, // bufX, 0 /*offx*/, incx, // bufY, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZhemv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); printResult(); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* At this point you will get the result of SSYMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_zher2.c000066400000000000000000000125441264277366700200720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. 
It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_double2 alpha = {{10.0f, 2.0f}}; static const clblasUplo uplo = clblasUpper; static cl_double2 A[] = { {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}}, {{00.0f, 00.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{00.0f, 00.0f}}, {{55.0f, 00.0f}} }; static const size_t lda = 5; /* i.e. lda = N */ static const cl_double2 X[] = { {{11.0f, 03.0f}}, {{01.0f, 15.0f}}, {{30.0f, 20.0f}}, {{01.0f, 02.0f}}, {{11.0f, 10.0f}} }; static const int incx = 1; static const cl_double2 Y[] = { {{11.0f, 03.0f}}, {{03.0f, 05.0f}}, {{09.0f, 00.0f}}, {{01.0f, 02.0f}}, {{11.0f, 00.0f}} }; static const int incy = 1; static void printResult(void) { size_t i, j; printf("\nResult:\n"); for (i = 0; i < N; i++) { for(j = 0; j < N; j++) printf("(%9.2lf, %-9.2lf)\t", CREAL( A[ i*N + j ] ), CIMAG( A[ i*N + j ] )); printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_double2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_double2), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_double2), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_double2), Y, 0, NULL, NULL); err = clblasZher2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZher2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. 
*/ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_double2)), A, 0, NULL, NULL); /* At this point you will get the result of ZHER2 placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/samples/example_zhpr2.c000066400000000000000000000133261264277366700201040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include /* Include CLBLAS header. It automatically includes needed OpenCL header, * so we can drop out explicit inclusion of cl.h header. */ #include /* This example uses predefined matrices and their characteristics for * simplicity purpose. */ static const clblasOrder order = clblasRowMajor; static const size_t N = 5; static cl_double2 alpha = {{10.0f, 2.0f}}; static const clblasUplo uplo = clblasUpper; static cl_double2 AP[] = { {{11.0f, 00.0f}}, {{12.0f, 02.0f}}, {{13.0f, 05.0f}}, {{14.0f, 12.0f}}, {{15.0f, 06.0f}}, {{22.0f, 00.0f}}, {{23.0f, 25.0f}}, {{24.0f, 23.0f}}, {{25.0f, 61.0f}}, {{33.0f, 00.0f}}, {{34.0f, 23.0f}}, {{35.0f, 21.0f}}, {{44.0f, 00.0f}}, {{45.0f, 23.0f}}, {{55.0f, 00.0f}} }; static const cl_double2 X[] = { {{11.0f, 03.0f}}, {{01.0f, 15.0f}}, {{30.0f, 20.0f}}, {{01.0f, 02.0f}}, {{11.0f, 10.0f}} }; static const int incx = 1; static const cl_double2 Y[] = { {{11.0f, 03.0f}}, {{03.0f, 05.0f}}, {{09.0f, 00.0f}}, {{01.0f, 02.0f}}, {{11.0f, 00.0f}} }; static const int incy = 1; static void printResult(void) { size_t i, j, off; printf("\nResult:\n"); off = 0; for (i = 0; i < N; i++) { for(j = 0; j < N; j++) { if( ( (uplo == clblasUpper) && (i > j)) || ((uplo == clblasLower) && (j > i)) ) { printf("\t\t\t"); continue; } printf("(%9.2lf, %-9.2lf)\t", CREAL( AP[ off ] ), CIMAG( AP[ off ] )); off ++ ; } printf("\n"); } } int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. 
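 * Note (informational, not in the original sample): as in the other samples,
 * the code below simply takes the first available platform and the first GPU
 * device reported by the OpenCL runtime; on machines with several platforms,
 * or without a GPU, this selection may need to be adjusted.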
*/ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_double2)), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_double2), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_double2), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_double2), Y, 0, NULL, NULL); err = clblasZhpr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZhpr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_double2)), AP, 0, NULL, NULL); /* At this point you will get the result of ZHPR2 placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; } clblas-2.10/src/scripts/000077500000000000000000000000001264277366700151765ustar00rootroot00000000000000clblas-2.10/src/scripts/perf/000077500000000000000000000000001264277366700161325ustar00rootroot00000000000000clblas-2.10/src/scripts/perf/CMakeLists.txt000066400000000000000000000020541264277366700206730ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## set(GRAPHING_SCRIPTS measurePerformance.py plotPerformance.py blasPerformanceTesting.py errorHandler.py performanceUtility.py ) if( WIN32 ) install( FILES ${GRAPHING_SCRIPTS} DESTINATION bin${SUFFIX_BIN} ) else ( ) install( FILES ${GRAPHING_SCRIPTS} DESTINATION share/clBLAS ) endif( ) clblas-2.10/src/scripts/perf/blasPerformanceTesting.py000066400000000000000000000304171264277366700231520ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## import itertools import re#gex import subprocess import os import sys from datetime import datetime # Common data and functions for the performance suite class TestCombination: def __init__(self, lengthx, lengthy, lengthz, batchsize, device, inlayout, outlayout, placeness, ldscomplex, ldsfraction, cachesize, xfactor, label): self.x = lengthx self.y = lengthy self.z = lengthz self.batchsize = batchsize self.device = device self.inlayout = inlayout self.outlayout = outlayout self.placeness = placeness self.ldscomplex = ldscomplex self.ldsfraction = ldsfraction self.cachesize = cachesize self.xfactor = xfactor self.label = label def __str__(self): return self.x + 'x' + self.y + 'x' + self.z + ':' + self.batchsize + ', ' + self.device + ', ' + self.inlayout + '/' + self.outlayout + ', ' + self.placeness + ', LDS comp(' + self.ldscomplex + '), LDS frac(' + self.ldsfraction + '), cachesz(' + self.cachesize + '), X-factor(' + self.xfactor + ') -- ' + self.label class GraphPoint: def __init__(self, lengthx, lengthy, lengthz, batchsize, ldsfraction, device, label, gflops): self.x = lengthx self.y = lengthy self.z = lengthz self.batchsize = batchsize self.device = device self.label = label self.ldsfraction = ldsfraction self.gflops = gflops self.problemsize = str(int(self.x) * int(self.y) * int(self.z) * int(self.batchsize)) def __str__(self): # ALL members must be represented here (x, y, z, batch, device, label, ldsfraction, etc) return self.x + 'x' + self.y + 'x' + self.z + ':' + self.batchsize + ', ' + self.device + ', LDS fraction = ' + self.ldsfraction + ' -- ' + self.label + '; ' + self.gflops class TableRow: # parameters = class TestCombination instantiation def __init__(self, parameters, gflops): self.parameters = parameters self.gflops = gflops def __str__(self): return self.parameters.__str__() + '; ' + self.gflops def transformDimension(x,y,z): if int(z) != 1: return 3 elif int(y) != 1: return 2 elif int(x) != 1: return 1 def executable(library): if type(library) != str: print 'ERROR: expected library name to be a string' quit() if sys.platform != 'win32' and sys.platform != 'linux2': print 'ERROR: unknown operating system' quit() if library == 'clblas': if sys.platform == 'win32': exe = 'clBLAS-client.exe' elif sys.platform == 'linux2': exe = './clBLAS-client' if library == 
'acmlblas': if sys.platform == 'win32': exe = 'ACMLBlas_client.exe' elif sys.platform == 'linux2': exe = './ACMLBlas_client' if library!='null' and library!='clblas' and library!='acmlblas': print 'ERROR: unknown library -- cannot determine executable name ' + library quit() if not os.path.isfile(exe): error_message = 'ERROR: could not find client named ' + exe print error_message quit() return exe def max_mem_available_in_bytes(exe, device): arguments = [exe, '-i', device] deviceInfo = subprocess.check_output(arguments, stderr=subprocess.STDOUT).split(os.linesep) deviceInfo = itertools.ifilter( lambda x: x.count('MAX_MEM_ALLOC_SIZE'), deviceInfo) deviceInfo = list(itertools.islice(deviceInfo, None)) maxMemoryAvailable = re.search('\d+$', deviceInfo[0]) return int(maxMemoryAvailable.group(0)) def max_problem_size(exe, device): numbers_in_one_datapoint = 2 # (i.e.: real or complex?) bytes_in_one_number = 4 # (i.e.: single or double precision?) return max_mem_available_in_bytes(exe, device) / (numbers_in_one_datapoint * bytes_in_one_number) def maxBatchSize(lengthx, lengthy, lengthz, exe, device): problemSize = int(lengthx) * int(lengthy) * int(lengthz) maxBatchSize = max_problem_size(exe, device) / problemSize if int(lengthx) == pow(2,16) or int(lengthx) == pow(2,17): # special cases in the kernel. extra padding is added in, so we need to shrink the batch size to accommodate return str(maxBatchSize/2) else: return str(maxBatchSize) def create_ini_file_if_requested(args): if args.createIniFilename: #print vars(args) for x in vars(args): #print x if (type(getattr(args,x)) != file) and getattr(args,x) != None\ and x.count('File') == 0: args.createIniFilename.write('--' + x + ' ') args.createIniFilename.write(str(getattr(args,x)) + '; ') quit() def load_ini_file_if_requested(args, parser): if args.useIniFilename: argument_list = args.useIniFilename.readlines() arg_string = str() for a in argument_list: arg_string += a arg_string = arg_string.replace(';', '') arg_string = arg_string.split() args = parser.parse_args(arg_string) return args def is_numeric_type(x): return type(x) == int or type(x) == long or type(x) == float def split_up_comma_delimited_lists(args): for x in vars(args): attr = getattr(args, x) if attr == None: setattr(args, x, [None]) elif is_numeric_type(attr): setattr(args, x, [attr]) elif type(attr) == str: setattr(args, x, attr.split(',')) return args class Range: def __init__(self, ranges, defaultStep='+1'): # we might be passed in a single value or a list of strings # if we receive a single value, we want to feed it right back if type(ranges) != list: self.expanded = ranges elif ranges[0] == None: self.expanded = [None] else: self.expanded = [] for thisRange in ranges: thisRange = str(thisRange) if re.search('^\+\d+$', thisRange): self.expanded = self.expanded + [thisRange] elif thisRange == 'max': self.expanded = self.expanded + ['max'] else: #elif thisRange != 'max': if thisRange.count(':'): self._stepAmount = thisRange.split(':')[1] else: self._stepAmount = defaultStep thisRange = thisRange.split(':')[0] if self._stepAmount.count('x'): self._stepper = '_mult' else: self._stepper = '_add' self._stepAmount = self._stepAmount.lstrip('+x') self._stepAmount = int(self._stepAmount) if thisRange.count('-'): self.begin = int(thisRange.split('-')[0]) self.end = int(thisRange.split('-')[1]) else: self.begin = int(thisRange.split('-')[0]) self.end = int(thisRange.split('-')[0]) self.current = self.begin if self.begin == 0 and self._stepper == '_mult': self.expanded = self.expanded + 
[0] else: while self.current <= self.end: self.expanded = self.expanded + [self.current] self._step() # now we want to uniquify and sort the expanded range self.expanded = list(set(self.expanded)) self.expanded.sort() # advance current value to next def _step(self): getattr(self, self._stepper)() def _mult(self): self.current = self.current * self._stepAmount def _add(self): self.current = self.current + self._stepAmount def expand_range(a_range): return Range(a_range).expanded def decode_parameter_problemsize(problemsize): if not problemsize.count(None): i = 0 while i < len(problemsize): problemsize[i] = problemsize[i].split(':') j = 0 while j < len(problemsize[i]): problemsize[i][j] = problemsize[i][j].split('x') j = j+1 i = i+1 return problemsize def blas_table_header(): return 'm,n,k,lda,ldb,ldc,offa,offb,offc,alpha,beta,order,transa,transb,side,uplo,diag,function,device,library,numQueues,label,GFLOPS' class BlasTestCombination: def __init__(self, sizem, sizen, sizek, lda, ldb, ldc, offa, offb, offc, alpha, beta, order, transa, transb, side, uplo, diag, function, precision, device, library, label): self.sizem = str(sizem) self.sizen = str(sizen) self.sizek = str(sizek) self.lda = str(lda) self.ldb = str(ldb) self.ldc = str(ldc) self.offa = str(offa) self.offb = str(offb) self.offc = str(offc) self.alpha = str(alpha) self.beta = str(beta) self.order = order self.transa = transa self.transb = transb self.side = side self.uplo = uplo self.diag = diag self.function = function self.precision = precision self.device = device self.library = library self.label = label def __str__(self): return self.sizem + 'x' + self.sizen + 'x' + self.sizek + ':' + self.lda + 'x' + self.ldb + 'x' + self.ldc + self.offa + 'x' + self.offb + 'x' + self.offc + ', ' + self.device + ', ' + self.precision + self.function + ', ' + self.library + ', alpha(' + self.alpha + '), beta(' + self.beta + '), order(' + self.order + '), transa(' + self.transa + '), transb(' + self.transb + '), side(' + self.side + '), uplo(' + self.uplo + '), diag(' + self.diag + ') -- ' + self.label class BlasGraphPoint: def __init__(self, sizem, sizen, sizek, lda, ldb, ldc, offa, offb, offc, device, order, transa, transb, function, library, label, gflops): self.sizem = sizem self.sizen = sizen self.sizek = sizek self.lda = lda self.ldb = ldb self.ldc = ldc self.offa = offa self.offb = offb self.offc = offc self.device = device self.order = order self.transa = transa self.transb = transb self.function = function self.library = library self.label = label self.gflops = gflops def __str__(self): # ALL members must be represented here (x, y, z, batch, device, label, ldsfraction, etc) return self.sizem + 'x' + self.sizen + 'x' + self.sizek + ':' + self.device + ', ' + self.function + ', ' + self.library + ', order(' + self.order + '), transa(' + self.transa + '), transb(' + self.transb + ') -- ' + self.label + '; ' + self.gflops + ' gflops' def open_file( filename ): if type(filename) == list: filename = filename[0] if filename == None: filename = 'results' + datetime.now().isoformat().replace(':','.') + '.txt' else: if os.path.isfile(filename): oldname = filename filename = filename + datetime.now().isoformat().replace(':','.') message = 'A file with the name ' + oldname + ' already exists. 
Changing filename to ' + filename print message return open(filename, 'w') clblas-2.10/src/scripts/perf/errorHandler.py000066400000000000000000000054101264277366700211330ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## #---------------------------------File Note------------------------------------ #Date: 27 January 2012 #This file defines all the error code and error handler mechanism #--------------------------------Global Variables------------------------------ UINS_CAT = 100 WIN_REG_SEARCH_FAIL = 101 UNIMPL_APP = 200 SYS_ERR = 300 TIME_OUT = 400 DIM_INCO_FILE_FMT = 500 #incorrect file format for dimension DIM_FILE_VAL_INCO = 501 #Value coming from dimension file is incorrect #__errorTable : Defines all the errors in the system. Add a new error code and # error message here """Error table is defined as private to this module""" errorTable = { UINS_CAT: 'Application is not able to find the installed catalyst', WIN_REG_SEARCH_FAIL: 'Windows Registry search for catalysts version is unsuccessful', UNIMPL_APP: 'Unimplemented Application requirement', SYS_ERR: 'System error occurred - Please check the source code', TIME_OUT: 'Operation is timed out', DIM_INCO_FILE_FMT: 'incorrect file format for dimension - Not able to find dimension', DIM_FILE_VAL_INCO: 'Value coming from dimension file is incorrect' } #--------------------------------Class Definitions----------------------------- class TimeoutException(Exception): pass """Base class for handling all the application generated exception""" class ApplicationException(Exception): def __init__(self, fileName, errno, msg = ""): self.fileName = fileName self.errno = errno self.mess = errorTable[errno] + msg self.message = 'Application ERROR:'+repr(self.fileName+'-'+str(self.errno)+'-'+self.mess) def __str__(self): return repr(self.fileName+'-'+str(self.errno)+'-'+self.mess) #--------------------------------Global Function------------------------------- if __name__ == '__main__': #print errorTable try: raise ApplicationException('errorHandler', SYS_ERR) except: print 'Generic exception' clblas-2.10/src/scripts/perf/measurePerformance.py000066400000000000000000000607741264277366700223450ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## import sys import argparse import subprocess import itertools import re#gex import os from threading import Timer, Thread import thread, time from platform import system from datetime import datetime import errorHandler from blasPerformanceTesting import * from performanceUtility import timeout, log IAM = 'BLAS' TIMOUT_VAL = 900 #In seconds """ define and parse parameters """ devicevalues = ['gpu', 'cpu'] libraryvalues = ['clblas','acmlblas'] ordervalues = ['row','column'] transvalues = ['none','transpose','conj'] sidevalues = ['left','right'] uplovalues = ['upper','lower'] diagvalues = ['unit','nonunit'] functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ] precisionvalues = ['s', 'd', 'c', 'z'] roundtripvalues = ['roundtrip','noroundtrip','both'] memallocvalues = ['default','alloc_host_ptr','use_host_ptr','copy_host_ptr','use_persistent_mem_amd'] parser = argparse.ArgumentParser(description='Measure performance of the clblas library') parser.add_argument('--device', dest='device', default='gpu', help='device(s) to run on; may be a comma-delimited list. choices are ' + str(devicevalues) + '. (default gpu)') parser.add_argument('-m', '--sizem', dest='sizem', default=None, help='size(s) of m to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000') parser.add_argument('-n', '--sizen', dest='sizen', default=None, help='size(s) of n to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000') parser.add_argument('-k', '--sizek', dest='sizek', default=None, help='size(s) of k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 1024 or 100-800:100 or 15,2048-3000') parser.add_argument('-s', '--square', dest='square', default=None, help='size(s) of m=n=k to test; may include ranges and comma-delimited lists. stepping may be indicated with a colon. this option sets lda = ldb = ldc to the values indicated with --lda for all problems set with --square. e.g., 1024 or 100-800:100 or 15,2048-3000') parser.add_argument('--problemsize', dest='problemsize', default=None, help='additional problems of a set size. may be used in addition to sizem/n/k and lda/b/c. each indicated problem size will be added to the list of problems to complete. should be entered in MxNxK:AxBxC format (where :AxBxC specifies lda/b/c. :AxBxC is optional. if included, lda/b/c are subject to the same range restrictions as indicated in the lda/b/c section of this help. if omitted, :0x0x0 is assumed). may enter multiple in a comma-delimited list. e.g., 2x2x2:4x6x9,3x3x3 or 1024x800x333') parser.add_argument('--lda', dest='lda', default=0, help='value of lda; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transA = \'n\', lda must be >= \'m\'. otherwise, lda must be >= \'k\'. if this is violated, the problem will be skipped. if lda is 0, it will be automatically set to match either \'m\' (if transA = \'n\') or \'k\' (otherwise). may indicate relative size with +X, where X is the offset relative to M or K (depending on transA). 
e.g., 1024 or 100-800:100 or 15,2048-3000 or +10 (if transA = \'n\' and M = 100, lda = 110) (default 0)') parser.add_argument('--ldb', dest='ldb', default=0, help='value of ldb; may include ranges and comma-delimited lists. stepping may be indicated with a colon. if transB = \'n\', ldb must be >= \'k\'. otherwise, ldb must be >= \'n\'. if this is violated, the problem will be skipped. if ldb is 0, it will be automatically set to match either \'k\' (if transB = \'n\') or \'n\' (otherwise). may indicate relative size with +X, where X is the offset relative to K or N (depending on transB). e.g., 1024 or 100-800:100 or 15,2048-3000 or +100 (if transB = \'n\' and K = 2000, ldb = 2100) (default 0)') parser.add_argument('--ldc', dest='ldc', default=0, help='value of ldc; may include ranges and comma-delimited lists. stepping may be indicated with a colon. ldc must be >= \'m\'. if this is violated, the problem will be skipped. if ldc is 0, it will be automatically set to match \'m\'. may indicate relative size with +X, where X is the offset relative to M. e.g., 1024 or 100-800:100 or 15,2048-3000 or +5 (if M = 15, ldc = 20) (default 0)') parser.add_argument('--offa', dest='offa', default=0, help='offset of the matrix A in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') parser.add_argument('--offb', dest='offb', default=0, help='offset of the matrix B or vector X in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') parser.add_argument('--offc', dest='offc', default=0, help='offset of the matrix C or vector Y in memory; may include ranges and comma-delimited lists. stepping may be indicated with a colon. e.g., 0-31 or 100-128:2 or 42 (default 0)') parser.add_argument('-a', '--alpha', dest='alpha', default=1.0, type=float, help='specifies the scalar alpha') parser.add_argument('-b', '--beta', dest='beta', default=1.0, type=float, help='specifies the scalar beta') parser.add_argument('-f', '--function', dest='function', default='gemm', help='indicates the function(s) to use. may be a comma delimited list. choices are ' + str(functionvalues) + ' (default gemm)') parser.add_argument('-r', '--precision', dest='precision', default='s', help='specifies the precision for the function. may be a comma delimited list. choices are ' + str(precisionvalues) + ' (default s)') parser.add_argument('-o', '--order', dest='order', default='row', help='select row or column major. may be a comma delimited list. choices are ' + str(ordervalues) + ' (default row)') parser.add_argument('--transa', dest='transa', default='none', help='select none, transpose, or conjugate transpose for matrix A. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)') parser.add_argument('--transb', dest='transb', default='none', help='select none, transpose, or conjugate transpose for matrix B. may be a comma delimited list. choices are ' + str(transvalues) + ' (default none)') parser.add_argument('--side', dest='side', default='left', help='select side, left or right for TRMM and TRSM. may be a comma delimited list. choices are ' + str(sidevalues) + ' (default left)') parser.add_argument('--uplo', dest='uplo', default='upper', help='select uplo, upper or lower triangle. may be a comma delimited list. 
choices are ' + str(uplovalues) + ' (default upper)') parser.add_argument('--diag', dest='diag', default='unit', help='select diag, whether set diagonal elements to one. may be a comma delimited list. choices are ' + str(diagvalues) + ' (default unit)') parser.add_argument('--library', dest='library', default='clblas', help='indicates the library to use. choices are ' + str(libraryvalues) + ' (default clblas)') parser.add_argument('--label', dest='label', default=None, help='a label to be associated with all transforms performed in this run. if LABEL includes any spaces, it must be in \"double quotes\". note that the label is not saved to an .ini file. e.g., --label cayman may indicate that a test was performed on a cayman card or --label \"Windows 32\" may indicate that the test was performed on Windows 32') parser.add_argument('--tablefile', dest='tableOutputFilename', default=None, help='save the results to a plaintext table with the file name indicated. this can be used with clblas.plotPerformance.py to generate graphs of the data (default: table prints to screen)') parser.add_argument('--roundtrip', dest='roundtrip', default='noroundtrip', help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML') parser.add_argument('--memalloc', dest='memalloc', default='default', help='set the flags for OpenCL memory allocation. Choices are ' + str(memallocvalues) + '. (default is default); do not need to set when calling ACML or if roundtrip is not set') ini_group = parser.add_mutually_exclusive_group() ini_group.add_argument('--createini', dest='createIniFilename', default=None, type=argparse.FileType('w'), help='create an .ini file with the given name that saves the other parameters given at the command line, then quit. e.g., \'clblas.measurePerformance.py -m 10 -n 100 -k 1000-1010 -f sgemm --createini my_favorite_setup.ini\' will create an .ini file that will save the configuration for an sgemm of the indicated sizes.') ini_group.add_argument('--ini', dest='useIniFilename', default=None, type=argparse.FileType('r'), help='use the parameters in the named .ini file instead of the command line parameters.') args = parser.parse_args() label = str(args.label) roundtrip = str(args.roundtrip) library = str(args.library) memalloc = str(args.memalloc) subprocess.call('mkdir perfLog', shell = True) logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt')) def printLog(txt): print txt log(logfile, txt) printLog("=========================MEASURE PERFORMANCE START===========================") printLog("Process id of Measure Performance:"+str(os.getpid())) #This function is defunct now @timeout(5, "fileName") # timeout is 15 minutes, 15*60 = 300 secs def checkTimeOutPut2(args): global currCommandProcess #ret = subprocess.check_output(args, stderr=subprocess.STDOUT) #return ret currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) printLog("Curr Command Process id = "+str(currCommandProcess.pid)) ret = currCommandProcess.communicate() if(ret[0] == None or ret[0] == ''): errCode = currCommandProcess.poll() raise subprocess.CalledProcessError(errCode, args, output=ret[1]) return ret[0] #Spawns a separate thread to execute the library command and wait for that thread to complete #This wait is of 900 seconds (15 minutes). 
If still the thread is alive then we kill the thread def checkTimeOutPut(args): t = None global currCommandProcess global stde global stdo stde = None stdo = None def executeCommand(): global currCommandProcess global stdo global stde try: stdo, stde = currCommandProcess.communicate() printLog('stdout:\n'+str(stdo)) printLog('stderr:\n'+str(stde)) except: printLog("ERROR: UNKNOWN Exception - +checkWinTimeOutPut()::executeCommand()") currCommandProcess = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) thread = Thread(target=executeCommand) thread.start() thread.join(TIMOUT_VAL) #wait for the thread to complete if thread.is_alive(): printLog('ERROR: Killing the process - terminating thread because it is taking too much of time to execute') currCommandProcess.kill() printLog('ERROR: Timed out exception') raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT) if stdo == "" or stdo==None: errCode = currCommandProcess.poll() printLog('ERROR: @@@@@Raising Called processor exception') raise subprocess.CalledProcessError(errCode, args, output=stde) return stdo printLog('Executing measure performance for label: '+str(label)) create_ini_file_if_requested(args) args = load_ini_file_if_requested(args, parser) args = split_up_comma_delimited_lists(args) """ check parameters for sanity """ if args.sizem.count(None) == 0 and (args.sizen.count(None) or args.sizek.count(None)): printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') quit() if args.sizen.count(None) == 0 and (args.sizem.count(None) or args.sizek.count(None)): printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') quit() if args.sizek.count(None) == 0 and (args.sizem.count(None) or args.sizen.count(None)): printLog( 'ERROR: if any of m, n, or k are specified, all of m, n, and k must be specified') quit() if args.square.count(None) and args.problemsize.count(None) and args.sizem.count(None) and args.sizen.count(None) and args.sizek.count(None): printLog( 'ERROR: at least one of [--square] or [--problemsize] or [-m, -n, and -k] must be specified') quit() args.sizem = expand_range(args.sizem) args.sizen = expand_range(args.sizen) args.sizek = expand_range(args.sizek) args.square = expand_range(args.square) args.lda = expand_range(args.lda) args.ldb = expand_range(args.ldb) args.ldc = expand_range(args.ldc) args.offa = expand_range(args.offa) args.offb = expand_range(args.offb) args.offc = expand_range(args.offc) args.problemsize = decode_parameter_problemsize(args.problemsize) """ create the problem size combinations for each run of the client """ if not args.sizem.count(None): # we only need to do make combinations of problem sizes if m,n,k have been specified explicitly problem_size_combinations = itertools.product(args.sizem, args.sizen, args.sizek, args.lda, args.ldb, args.ldc) problem_size_combinations = list(itertools.islice(problem_size_combinations, None)) else: problem_size_combinations = [] """ add manually entered problem sizes to the list of problems to crank out """ manual_test_combinations = [] if not args.problemsize.count(None): for n in args.problemsize: sizem = [] sizen = [] sizek = [] lda = [] ldb = [] ldc = [] sizem.append(int(n[0][0])) sizen.append(int(n[0][1])) sizek.append(int(n[0][2])) if len(n) > 1: lda.append(int(n[1][0])) ldb.append(int(n[1][1])) ldc.append(int(n[1][2])) else: lda.append(0) ldb.append(0) ldc.append(0) combos = itertools.product(sizem,sizen,sizek,lda,ldb,ldc) combos = 
list(itertools.islice(combos, None)) for n in combos: manual_test_combinations.append(n) """ add square problem sizes to the list of problems to crank out """ square_test_combinations = [] if not args.square.count(None): for n in args.square: combos = itertools.product([n],[n],[n],args.lda) # only lda is considered with --square, and lda/b/c are all set to the values specified by lda combos = list(itertools.islice(combos, None)) for n in combos: square_test_combinations.append((n[0],n[1],n[2],n[3],n[3],n[3])) # set lda/b/c = lda problem_size_combinations = problem_size_combinations + manual_test_combinations + square_test_combinations """ create final list of all transformations (with problem sizes and transform properties) """ test_combinations = itertools.product(problem_size_combinations, args.offa, args.offb, args.offc, args.alpha, args.beta, args.order, args.transa, args.transb, args.side, args.uplo, args.diag, args.function, args.precision, args.device, args.library) test_combinations = list(itertools.islice(test_combinations, None)) test_combinations = [BlasTestCombination(params[0][0], params[0][1], params[0][2], params[0][3], params[0][4], params[0][5], params[1], params[2], params[3], params[4], params[5], params[6], params[7], params[8], params[9], params[10], params[11], params[12], params[13], params[14], params[15], label) for params in test_combinations] """ open output file and write the header """ table = open_file(args.tableOutputFilename) table.write(blas_table_header() + '\n') table.flush() """ turn each test combination into a command, run the command, and then stash the gflops """ result = [] # this is where we'll store the results for the table printLog( 'Total combinations = '+str(len(test_combinations))) vi = 0 #test_combinations = test_combinations[:5] for params in test_combinations: vi = vi+1 printLog('preparing command: '+ str(vi)) device = params.device sizem = params.sizem sizen = params.sizen sizek = params.sizek lda = params.lda ldb = params.ldb ldc = params.ldc offa = params.offa offb = params.offb offc = params.offc alpha = params.alpha beta = params.beta function = params.function precision = params.precision library = params.library label = params.label if params.order == 'row': order = str(0) elif params.order == 'column': order = str(1) else: printLog( 'ERROR: unknown value for order') quit() if params.side == 'left': side = str(0) elif params.side == 'right': side = str(1) else: printLog( 'ERROR: unknown value for side') quit() if params.uplo == 'upper': uplo = str(0) elif params.uplo == 'lower': uplo = str(1) else: printLog( 'ERROR: unknown value for uplo') quit() if params.diag == 'unit': diag = str(0) elif params.diag == 'nonunit': diag = str(1) else: printLog( 'ERROR: unknown value for diag') quit() if re.search('^\+\d+$', lda): if params.transa == 'none': lda = str(int(lda.lstrip('+')) + int(sizem)) else: lda = str(int(lda.lstrip('+')) + int(sizek)) if re.search('^\+\d+$', ldb): if params.transb == 'none': ldb = str(int(ldb.lstrip('+')) + int(sizek)) else: ldb = str(int(ldb.lstrip('+')) + int(sizen)) if re.search('^\+\d+$', ldc): ldc = str(int(ldc.lstrip('+')) + int(sizem)) if params.transa == 'none': transa = str(0) elif params.transa == 'transpose': transa = str(1) elif params.transa == 'conj': transa = str(2) else: printLog( 'ERROR: unknown value for transa') if params.transb == 'none': transb = str(0) elif params.transb == 'transpose': transb = str(1) elif params.transb == 'conj': transb = str(2) else: printLog( 'ERROR: unknown value for 
transb') if library == 'acmlblas': arguments = [executable(library), '--' + device, '-m', sizem, '-n', sizen, '-k', sizek, '--lda', lda, '--ldb', ldb, '--ldc', ldc, '--offA', offa, '--offBX', offb, '--offCY', offc, '--alpha', alpha, '--beta', beta, '--order', order, '--transposeA', transa, '--transposeB', transb, '--side', side, '--uplo', uplo, '--diag', diag, '--function', function, '--precision', precision, '-p', '10', '--roundtrip', roundtrip] elif library == 'clblas': arguments = [executable(library), '--' + device, '-m', sizem, '-n', sizen, '-k', sizek, '--lda', lda, '--ldb', ldb, '--ldc', ldc, '--offA', offa, '--offBX', offb, '--offCY', offc, '--alpha', alpha, '--beta', beta, '--order', order, '--transposeA', transa, '--transposeB', transb, '--side', side, '--uplo', uplo, '--diag', diag, '--function', function, '--precision', precision, '-p', '10', '--roundtrip', roundtrip, '--memalloc', memalloc] else: printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command') quit() writeline = True try: printLog('Executing Command: '+str(arguments)) output = checkTimeOutPut(arguments); output = output.split(os.linesep); printLog('Execution Successfull---------------\n') except errorHandler.ApplicationException as ae: writeline = False #Killing the process #if system() != 'Windows': # currCommandProcess.kill() # printLog('ERROR: Killed process') printLog('ERROR: Command is taking too much of time-- '+ae.message+'\n'+'Command: \n'+str(arguments)) except subprocess.CalledProcessError as clientCrash: if clientCrash.output.count('bad_alloc'): writeline = False printLog( 'Omitting line from table - problem is too large') elif clientCrash.output.count('CL_INVALID_BUFFER_SIZE'): writeline = False printLog( 'Omitting line from table - problem is too large') elif clientCrash.output.count('CL_INVALID_WORK_GROUP_SIZE'): writeline = False printLog( 'Omitting line from table - workgroup size is invalid') elif clientCrash.output.count('lda must be set to 0 or a value >='): writeline = False printLog( 'Omitting line from table - lda is too small') elif clientCrash.output.count('ldb must be set to 0 or a value >='): writeline = False printLog( 'Omitting line from table - ldb is too small') elif clientCrash.output.count('ldc must be set to 0 or a value >='): writeline = False printLog( 'Omitting line from table - ldc is too small') else: writeline = False printLog('ERROR: client crash.\n') printLog(str(clientCrash.output)) printLog( str(clientCrash)) printLog('In original code we quit here - 1') continue #quit() if writeline: gflopsoutput = itertools.ifilter( lambda x: x.count('Gflops'), output) gflopsoutput = list(itertools.islice(gflopsoutput, None)) thisResult = re.search('\d+\.*\d*e*-*\d*$', gflopsoutput[0]) if thisResult != None: thisResult = float(thisResult.group(0)) thisResult = (params.sizem, params.sizen, params.sizek, params.lda, params.ldb, params.ldc, params.offa, params.offb, params.offc, params.alpha, params.beta, params.order, params.transa, params.transb, params.side, params.uplo, params.diag, params.precision + params.function, params.device, params.library, params.label, thisResult) outputRow = '' for x in thisResult: outputRow = outputRow + str(x) + ',' outputRow = outputRow.rstrip(',') table.write(outputRow + '\n') table.flush() else: if gflopsoutput[0].find('nan') or gflopsoutput[0].find('inf'): printLog( 'WARNING: output from client was funky for this run. 
skipping table row') else: prinLog( 'ERROR: output from client makes no sense') prinLog(str( gflopsoutput[0])) printLog('In original code we quit here - 2') continue #quit() printLog("=========================MEASURE PERFORMANCE ENDS===========================\n") clblas-2.10/src/scripts/perf/performanceUtility.py000066400000000000000000000057441264277366700224030ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## #This file contains a number of utilities function which could be independent of #any specific domain concept import signal from subprocess import check_output import errorHandler from datetime import datetime def currentUser(): try: return check_output("who", shell = True).split()[0]; except: print 'Unhandled Exception at performanceUtility::currentUser()' raise #Details: Generate sorted numbers in radices of 2,3 and 5 upto a given upper limit number def generate235Radices(maxSize): sizeList = list() i = 0 j = 0 k = 0 SUM = int() sumj = int() sumk = int() sumi = 1 while(True): sumj = 1 j = 0 while(True): sumk = 1 k = 0 while(True): SUM = sumi*sumj*sumk if ( SUM > maxSize ): break sizeList.append(SUM) k += 1 sumk *= 2 if (k == 0): break j += 1 sumj *= 3 if ( j == 0 and k == 0): break i += 1 sumi *= 5 sizeList.sort() return sizeList def timeout(timeout_time, default): def timeout_function(f): def f2(args): def timeout_handler(signum, frame): raise errorHandler.TimeoutException() old_handler = signal.signal(signal.SIGALRM, timeout_handler) signal.alarm(timeout_time) # triger alarm in timeout_time seconds retval = "" try: retval = f(args) except errorHandler.TimeoutException: raise errorHandler.ApplicationException(__file__, errorHandler.TIME_OUT) except: signal.alarm(0) raise finally: #print 'executing finally' signal.signal(signal.SIGALRM, old_handler) signal.alarm(0) return retval return f2 return timeout_function def logTxtOutput(fileName, mode, txt): todayFile = fileName+'-'+datetime.now().strftime('%Y-%b-%d')+'.txt' with open(todayFile, mode) as f: f.write('------\n'+txt+'\n') def log(filename, txt): with open(filename, 'a') as f: f.write(datetime.now().ctime()+'# '+txt+'\n') clblas-2.10/src/scripts/perf/plotPerformance.py000066400000000000000000000277761264277366700216670ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. # ######################################################################## # to use this script, you will need to download and install the 32-BIT VERSION of: # - Python 2.7 x86 (32-bit) - http://www.python.org/download/releases/2.7.1 # # you will also need the 32-BIT VERSIONS of the following packages as not all the packages are available in 64bit at the time of this writing # The ActiveState python distribution is recommended for windows # (make sure to get the python 2.7-compatible packages): # - NumPy 1.5.1 (32-bit, 64-bit unofficial, supports Python 2.4 - 2.7 and 3.1 - 3.2.) - http://sourceforge.net/projects/numpy/files/NumPy/ # - matplotlib 1.0.1 (32-bit & 64-bit, supports Python 2.4 - 2.7) - http://sourceforge.net/projects/matplotlib/files/matplotlib/ # # For ActiveState Python, all that one should need to type is 'pypm install matplotlib' import datetime import sys import argparse import subprocess import itertools import os import matplotlib import pylab from matplotlib.backends.backend_pdf import PdfPages from blasPerformanceTesting import * def plotGraph(dataForAllPlots, title, plottype, plotkwargs, xaxislabel, yaxislabel): """ display a pretty graph """ colors = ['k','y','m','c','r','b','g'] #plottype = 'plot' for thisPlot in dataForAllPlots: getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata, '{}.-'.format(colors.pop()), label=thisPlot.label, **plotkwargs) if len(dataForAllPlots) > 1: pylab.legend(loc='best') pylab.title(title) pylab.xlabel(xaxislabel) pylab.ylabel(yaxislabel) pylab.grid(True) if args.outputFilename == None: # if no pdf output is requested, spit the graph to the screen . . . pylab.show() else: pylab.savefig(args.outputFilename,dpi=(1024/8)) # . . . otherwise, gimme gimme pdf #pdf = PdfPages(args.outputFilename) #pdf.savefig() #pdf.close() ######## plotFromDataFile() Function to plot from data file begins ######## def plotFromDataFile(): data = [] """ read in table(s) from file(s) """ for thisFile in args.datafile: if not os.path.isfile(thisFile): print 'No file with the name \'{}\' exists. 
Please indicate another filename.'.format(thisFile) quit() results = open(thisFile, 'r') results_contents = results.read() results_contents = results_contents.rstrip().split('\n') firstRow = results_contents.pop(0) print firstRow print blas_table_header() print firstRow.rstrip()==blas_table_header() if firstRow.rstrip() != blas_table_header(): print 'ERROR: input file \'{}\' does not match expected format.'.format(thisFile) quit() for row in results_contents: row = row.split(',') row = TableRow(BlasTestCombination(row[0],row[1],row[2],row[3],row[4],row[5],row[6],row[7],row[8],row[9],row[10],row[11],row[12],row[13],row[14], row[15], row[16], row[17][1:], row[17][0], row[18], row[19], row[20]), row[21]) data.append(BlasGraphPoint(row.parameters.sizem, row.parameters.sizen, row.parameters.sizek, row.parameters.lda, row.parameters.ldb, row.parameters.ldc, row.parameters.offa , row.parameters.offb , row.parameters.offc , row.parameters.device, row.parameters.order, row.parameters.transa, row.parameters.transb, row.parameters.precision + row.parameters.function, row.parameters.library, row.parameters.label, row.gflops)) """ data sanity check """ # if multiple plotvalues have > 1 value among the data rows, the user must specify which to plot multiplePlotValues = [] for option in plotvalues: values = [] for point in data: values.append(getattr(point, option)) multiplePlotValues.append(len(set(values)) > 1) if multiplePlotValues.count(True) > 1 and args.plot == None: print 'ERROR: more than one parameter of {} has multiple values. Please specify which parameter to plot with --plot'.format(plotvalues) quit() # if args.graphxaxis is not 'problemsize', the user should know that the results might be strange #if args.graphxaxis != 'problemsize': # xaxisvalueSet = [] # for option in xaxisvalues: # if option != 'problemsize': # values = [] # for point in data: # values.append(getattr(point, option)) # xaxisvalueSet.append(len(set(values)) > 1) # if xaxisvalueSet.count(True) > 1: # print 'WARNING: more than one parameter of {} is varied. unexpected results may occur. please double check your graphs for accuracy.'.format(xaxisvalues) # multiple rows should not have the same input values #pointInputs = [] #for point in data: # pointInputs.append(point.__str__().split(';')[0]) #if len(set(pointInputs)) != len(data): # print 'ERROR: imported table has duplicate rows with identical input parameters' # quit() """ figure out if we have multiple plots on this graph (and what they should be) """ if args.plot != None: multiplePlots = args.plot elif multiplePlotValues.count(True) == 1 and plotvalues[multiplePlotValues.index(True)] != 'sizek': # we don't ever want to default to sizek, because it's probably going to vary for most plots # we'll require the user to explicitly request multiple plots on sizek if necessary multiplePlots = plotvalues[multiplePlotValues.index(True)] else: # default to device if none of the options to plot have multiple values multiplePlots = 'device' """ assemble data for the graphs """ data.sort(key=lambda row: int(getattr(row, args.graphxaxis))) # choose scale for x axis if args.xaxisscale == None: # user didn't specify. 
autodetect if int(getattr(data[len(data)-1], args.graphxaxis)) > 2000: # big numbers on x-axis args.xaxisscale = 'log2' elif int(getattr(data[len(data)-1], args.graphxaxis)) > 10000: # bigger numbers on x-axis args.xaxisscale = 'log10' else: # small numbers on x-axis args.xaxisscale = 'linear' if args.xaxisscale == 'linear': plotkwargs = {} plottype = 'plot' elif args.xaxisscale == 'log2': plottype = 'semilogx' plotkwargs = {'basex':2} elif args.xaxisscale == 'log10': plottype = 'semilogx' plotkwargs = {'basex':10} else: print 'ERROR: invalid value for x-axis scale' quit() plots = set(getattr(row, multiplePlots) for row in data) class DataForOnePlot: def __init__(self, inlabel, inxdata, inydata): self.label = inlabel self.xdata = inxdata self.ydata = inydata dataForAllPlots = [] for plot in plots: dataForThisPlot = itertools.ifilter( lambda x: getattr(x, multiplePlots) == plot, data) dataForThisPlot = list(itertools.islice(dataForThisPlot, None)) #if args.graphxaxis == 'problemsize': # xdata = [int(row.x) * int(row.y) * int(row.z) * int(row.batchsize) for row in dataForThisPlot] #else: xdata = [getattr(row, args.graphxaxis) for row in dataForThisPlot] ydata = [getattr(row, args.graphyaxis) for row in dataForThisPlot] dataForAllPlots.append(DataForOnePlot(plot,xdata,ydata)) """ assemble labels for the graph or use the user-specified ones """ if args.graphtitle: # use the user selection title = args.graphtitle else: # autogen a lovely title title = 'Performance vs. ' + args.graphxaxis.capitalize() if args.xaxislabel: # use the user selection xaxislabel = args.xaxislabel else: # autogen a lovely x-axis label if args.graphxaxis == 'cachesize': units = '(bytes)' else: units = '(datapoints)' xaxislabel = args.graphxaxis + ' ' + units if args.yaxislabel: # use the user selection yaxislabel = args.yaxislabel else: # autogen a lovely y-axis label if args.graphyaxis == 'gflops': units = 'GFLOPS' yaxislabel = 'Performance (' + units + ')' """ display a pretty graph """ colors = ['k','y','m','c','r','b','g'] for thisPlot in dataForAllPlots: getattr(pylab, plottype)(thisPlot.xdata, thisPlot.ydata, '{}.-'.format(colors.pop()), label=thisPlot.label, **plotkwargs) if len(dataForAllPlots) > 1: pylab.legend(loc='best') pylab.title(title) pylab.xlabel(xaxislabel) pylab.ylabel(yaxislabel) pylab.grid(True) if args.outputFilename == None: # if no pdf output is requested, spit the graph to the screen . . . pylab.show() else: # . . . otherwise, gimme gimme pdf #pdf = PdfPages(args.outputFilename) #pdf.savefig() #pdf.close() pylab.savefig(args.outputFilename,dpi=(1024/8)) ######### plotFromDataFile() Function to plot from data file ends ######### ######## "main" program begins ##### """ define and parse parameters """ xaxisvalues = ['sizem','sizen','sizek'] yaxisvalues = ['gflops'] plotvalues = ['lda','ldb','ldc','sizek','device','label','order','transa','transb','function','library'] parser = argparse.ArgumentParser(description='Plot performance of the clblas\ library. clblas.plotPerformance.py reads in data tables from clblas.\ measurePerformance.py and plots their values') fileOrDb = parser.add_mutually_exclusive_group(required=True) fileOrDb.add_argument('-d', '--datafile', dest='datafile', action='append', default=None, required=False, help='indicate a file to use as input. must be in the format output by\ clblas.measurePerformance.py. may be used multiple times to indicate\ multiple input files. 
e.g., -d cypressOutput.txt -d caymanOutput.txt') parser.add_argument('-x', '--x_axis', dest='graphxaxis', default=None, choices=xaxisvalues, required=True, help='indicate which value will be represented on the x axis. problemsize\ is defined as x*y*z*batchsize') parser.add_argument('-y', '--y_axis', dest='graphyaxis', default='gflops', choices=yaxisvalues, help='indicate which value will be represented on the y axis') parser.add_argument('--plot', dest='plot', default=None, choices=plotvalues, help='indicate which of {} should be used to differentiate multiple plots.\ this will be chosen automatically if not specified'.format(plotvalues)) parser.add_argument('--title', dest='graphtitle', default=None, help='the desired title for the graph generated by this execution. if\ GRAPHTITLE contains any spaces, it must be entered in \"double quotes\".\ if this option is not specified, the title will be autogenerated') parser.add_argument('--x_axis_label', dest='xaxislabel', default=None, help='the desired label for the graph\'s x-axis. if XAXISLABEL contains\ any spaces, it must be entered in \"double quotes\". if this option\ is not specified, the x-axis label will be autogenerated') parser.add_argument('--x_axis_scale', dest='xaxisscale', default=None, choices=['linear','log2','log10'], help='the desired scale for the graph\'s x-axis. if nothing is specified,\ it will be selected automatically') parser.add_argument('--y_axis_label', dest='yaxislabel', default=None, help='the desired label for the graph\'s y-axis. if YAXISLABEL contains any\ spaces, it must be entered in \"double quotes\". if this option is not\ specified, the y-axis label will be autogenerated') parser.add_argument('--outputfile', dest='outputFilename', default=None, help='name of the file to output graphs. Supported formats: emf, eps, pdf, png, ps, raw, rgba, svg, svgz.') args = parser.parse_args() if args.datafile != None: plotFromDataFile() else: print "Atleast specify if you want to use text files or database for plotting graphs. Use -h or --help option for more details" quit() clblas-2.10/src/targetver.h000066400000000000000000000027411264277366700156670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #pragma once // The following macros define the minimum required platform. The minimum required platform // is the earliest version of Windows, Internet Explorer etc. that has the necessary features to run // your application. The macros work by enabling all features available on platform versions up to and // including the version specified. // Modify the following defines if you have to target a platform prior to the ones specified below. // Refer to MSDN for the latest info on corresponding values for different platforms. #ifndef _WIN32_WINNT // Specifies that the minimum required platform is Windows Vista. 
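// For reference (informational comment, not part of the original header):
// 0x0600 selects Windows Vista / Windows Server 2008 as the minimum platform;
// a value such as 0x0601 would target Windows 7 instead (see the
// _WIN32_WINNT_* constants defined in the Windows SDK header sdkddkver.h).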
#define _WIN32_WINNT 0x0600 // Change this to the appropriate value to target other versions of Windows. #endif clblas-2.10/src/tests/000077500000000000000000000000001264277366700146515ustar00rootroot00000000000000clblas-2.10/src/tests/BasicRoutines.cpp000066400000000000000000000064201264277366700201310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include class BasicRoutines : public testing::Test { protected: BasicRoutines() : platform(0), device(0), context(NULL), queue(NULL) { } virtual ~BasicRoutines() { } virtual void SetUp() { cl_int err; cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 }; ASSERT_EQ(CL_SUCCESS, clGetPlatformIDs(1, &platform, NULL)); ASSERT_EQ(CL_SUCCESS, clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL)); props[1] = (cl_context_properties)platform; context = clCreateContext(props, 1, &device, NULL, NULL, &err); ASSERT_EQ(CL_SUCCESS, err) << "clCreateContext() failed"; queue = clCreateCommandQueue(context, device, 0, &err); ASSERT_EQ(CL_SUCCESS, err) << "clCreateCommandQueue() failed"; } virtual void TearDown() { if (queue != NULL) { clReleaseCommandQueue(queue); } if (context != NULL) { clReleaseContext(context); } } cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; }; TEST_F(BasicRoutines, UsualCodeFlow) { EXPECT_EQ(CL_SUCCESS, clblasSetup()); EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(context)); EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(context)); clblasTeardown(); } TEST_F(BasicRoutines, DoubleSetup) { EXPECT_EQ(CL_SUCCESS, clblasSetup()); EXPECT_NE(clblasSetup(), CL_SUCCESS); clblasTeardown(); } TEST_F(BasicRoutines, MissedSetup) { EXPECT_NE(AMD_clBLAS_PREBUILD_KERNELS(context), CL_SUCCESS); } TEST_F(BasicRoutines, BadContext) { EXPECT_EQ(CL_SUCCESS, clblasSetup()); EXPECT_NE(AMD_clBLAS_PREBUILD_KERNELS(NULL), CL_SUCCESS); clblasTeardown(); } TEST_F(BasicRoutines, TwoContexts) { cl_int err; cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context anotherContext; EXPECT_EQ(CL_SUCCESS, clblasSetup()); props[1] = (cl_context_properties)platform; anotherContext = clCreateContext(props, 1, &device, NULL, NULL, &err); ASSERT_EQ(CL_SUCCESS, err) << "Need a context"; ASSERT_NE(context, anotherContext) << "Contexts must be different"; EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(context)); EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_PREBUILD_KERNELS(anotherContext)); EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(context)); EXPECT_EQ(CL_SUCCESS, AMD_clBLAS_CLEANUP_KERNELS(anotherContext)); clReleaseContext(context); clblasTeardown(); } clblas-2.10/src/tests/BlasBase.cpp000066400000000000000000000331121264277366700170310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, 
Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include namespace clMath { BlasBase* BlasBase::getInstance() { static BlasBase instance; if (!instance.initialized()) { return NULL; } return &instance; } BlasBase::BlasBase() : platform_(0), primaryDevice_(0), additionalDevice_(0), context_(NULL), useNumCommandQueues_(false), numCommandQueues_(1), useAlpha_(false), useBeta_(false), useSeed_(false), useM_(false), useN_(false), useK_(false), M_(0), N_(0), K_(0), useIncX_(false), useIncY_(false), incX_(0), incY_(0), useImages_(false), devType_(CL_DEVICE_TYPE_GPU), imageA_(0), imageB_(0) { memset(&alpha_, 0, sizeof(alpha_)); memset(&beta_, 0, sizeof(beta_)); memset(commandQueues_, 0, sizeof(commandQueues_)); SetUp(); } BlasBase::~BlasBase() { /* * Teardown() is disabled due to troubles with test interrupting * with CTRL-C in windows. This occurs since after pressing of these keys * the OpenCL runtime is destroyed before calling global object destructors. */ #if 0 TearDown(); #endif } cl_int BlasBase::getPlatforms(cl_platform_id **platforms, cl_int *error) { cl_int err; cl_uint nrPlatforms; //platforms = NULL; if (error != NULL) { *error = CL_SUCCESS; } err = clGetPlatformIDs(0, NULL, &nrPlatforms); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return 0; } if (nrPlatforms == 0) { return 0; } *platforms = new cl_platform_id[nrPlatforms]; err = clGetPlatformIDs(nrPlatforms, *platforms, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } delete[] platforms; return 0; } return nrPlatforms; } cl_device_id BlasBase::getDevice(cl_device_type type, const char* name, cl_int *error) { cl_int err; cl_uint nrDevices, i, p; cl_device_id *devices, result = NULL; size_t sz; char *str; cl_platform_id *platforms, selPlatform = NULL; cl_uint nrPlatforms; cl_device_info devInfo; nrPlatforms = getPlatforms(&platforms, &err); if (error != NULL) { *error = CL_SUCCESS; } /* * If device name is not specified, then any AMD device is preferable. * It there are not AMD devices of such a type presented in the system, * then get a device of another vendor. If this is the additional device * which is being tried to get, it must be supported by the same platform * as the primary device does. 
*/ if (name == NULL) { name = "Advanced Micro Devices, Inc."; devInfo = CL_DEVICE_VENDOR; } else { devInfo = CL_DEVICE_NAME; type = CL_DEVICE_TYPE_ALL; } for (p = 0; p < nrPlatforms; p++) { cl_platform_id platform = platforms[p]; err = clGetDeviceIDs(platform, type, 0, NULL, &nrDevices); if (err == CL_DEVICE_NOT_FOUND) { continue; } if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } return NULL; } if (nrDevices == 0) { return NULL; } devices = new cl_device_id[nrDevices]; err = clGetDeviceIDs(platform, type, nrDevices, devices, NULL); if (err != CL_SUCCESS) { if (error != NULL) { *error = err; } delete[] devices; return NULL; } for (i = 0; i < nrDevices; i++) { err = clGetDeviceInfo(devices[i], devInfo, 0, NULL, &sz); if (err != CL_SUCCESS) { continue; } str = new char[sz + 1]; memset(str, 0, sz + 1); err = clGetDeviceInfo(devices[i], devInfo, sz, str, NULL); if (err != CL_SUCCESS) { delete[] str; continue; } if ((devInfo == CL_DEVICE_VENDOR) && (result == NULL) && ((platform_ == NULL) || (platform == platform_))) { result = devices[i]; selPlatform = platform; } printf("---- %s\n", str); if (strcmp(str, name) == 0) { //printf("---- %s\n", str); platform_ = platform; result = devices[i]; delete[] str; break; } delete[] str; } delete[] devices; devices = NULL; } if (platform_ == NULL) { platform_ = selPlatform; } delete[] platforms; return result; } void BlasBase::SetUp() { cl_int err = CL_SUCCESS; cl_context_properties props[] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_uint i = 1; cl_uint addDevQueueIdx = MAX_COMMAND_QUEUES; cl_device_id devices[2] = {NULL, NULL}; primaryDevice_ = getDevice(devType_, devName_, &err); if ((err != CL_SUCCESS) || (primaryDevice_ == NULL)) { ASSERT_EQ(CL_SUCCESS, clGetPlatformIDs(1, &platform_, NULL)); ASSERT_EQ(CL_SUCCESS, clGetDeviceIDs(platform_, devType_, 1, &primaryDevice_, NULL)); } devices[0] = primaryDevice_; #if !defined(TEST_WITH_SINGLE_DEVICE) cl_device_type addDevType; if (MAX_COMMAND_QUEUES > 1) { addDevType = (devType_ == CL_DEVICE_TYPE_GPU) ? CL_DEVICE_TYPE_CPU : CL_DEVICE_TYPE_GPU; additionalDevice_ = getDevice(addDevType, NULL, NULL); if (additionalDevice_ != NULL) { addDevQueueIdx = (MAX_COMMAND_QUEUES <= 3) ? (MAX_COMMAND_QUEUES - 1) : 2; devices[1] = additionalDevice_; i = 2; } } #endif /* !TEST_WITH_SINGLE_DEVICE */ props[1] = (cl_context_properties)platform_; context_ = clCreateContext(props, i, devices, NULL, NULL, &err); ASSERT_EQ(CL_SUCCESS, err) << "clCreateContext() failed"; #ifdef DEBUG_CONTEXT printf("SetUp: Created context %p\n", context_); #endif printf("SetUp: about to create command queues\n"); for (i = 0; i < MAX_COMMAND_QUEUES; i++) { cl_device_id dev; dev = (i == addDevQueueIdx) ? 
additionalDevice_ : primaryDevice_; commandQueues_[i] = clCreateCommandQueue(context_, dev, 0 /*CL_QUEUE_PROFILING_ENABLE*/, &err); ASSERT_EQ(CL_SUCCESS, err) << "clCreateCommandQueue() failed"; } ASSERT_EQ(CL_SUCCESS, clblasSetup()); } void BlasBase::TearDown() { cl_uint i; for (i = 0; i < MAX_COMMAND_QUEUES; i++) { clReleaseCommandQueue(commandQueues_[i]); } numCommandQueues_ = 1; if (context_ != NULL) { clReleaseContext(context_); context_ = NULL; } primaryDevice_ = additionalDevice_ = NULL; clblasTeardown(); } bool BlasBase::initialized() { return (context_ != NULL); } bool BlasBase::setDeviceType(cl_device_type* devType, const char* devName) { if (devType_ == *devType && devName_ == devName) { return true; } devType_ = *devType; devName_ = devName; if (!initialized()) { return true; } TearDown(); SetUp(); *devType = devType_; return initialized(); } cl_mem BlasBase::createEnqueueBuffer( const void *data, size_t matrSize, size_t off, cl_mem_flags mode) { cl_int err; cl_mem buf; cl_uint i; #ifdef DEBUG_CONTEXT cl_uint refcnt; printf("BLASBASE: createEnqBuff - Querying context %p\n", context_); if (clGetContextInfo(context_, CL_CONTEXT_REFERENCE_COUNT, sizeof(cl_uint), &refcnt, NULL) != CL_SUCCESS) { printf("BLASBASE: clGetContextInfo FAILED\n"); } else { printf("BLASBASE: REFCNT = %u\n", refcnt); } #endif buf = clCreateBuffer(context_, mode, matrSize + off, NULL, &err); if ( data != NULL ) { if (err == CL_SUCCESS ) { for (i = 0; i < numCommandQueues_; i++) { err = clEnqueueWriteBuffer(commandQueues_[i], buf, CL_TRUE, off, matrSize, data, 0, NULL, NULL); if (err != CL_SUCCESS) { clReleaseMemObject(buf); return NULL; } } } } return buf; } bool BlasBase::isDevSupportDoublePrecision(void) { cl_int err; cl_uint v; size_t len; char *extensions, *s; /* Check for cl_khr_fp64 extension */ err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(cl_uint), &v, NULL); if (err != CL_SUCCESS) { return false; } if (v != 0) { return true; } /* Check extensions */ err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_EXTENSIONS, 0, NULL, &len); if (err != CL_SUCCESS) { return false; } extensions = new char[len]; err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_EXTENSIONS, len, extensions, NULL); if (err != CL_SUCCESS) { delete[] extensions; return false; } /* Check for cl_amd_fp64 extension */ s = strstr(extensions, "cl_amd_fp64"); /* strlen("cl_amd_fp64") = 11 */ if (s != NULL) { if ((s[11] == ' ') || (s[11] == '\0')) { delete[] extensions; return true; } } delete[] extensions; return false; } void BlasBase::removeScratchImages(void) { //if (imageB_) { // clblasRemoveScratchImage(imageB_); //} //if (imageA_) { // clblasRemoveScratchImage(imageA_); //} } size_t BlasBase::scratchImageWidth(void) { size_t width; clGetImageInfo(reinterpret_cast(imageA_), CL_IMAGE_WIDTH, sizeof(width), &width, NULL); return width; } size_t BlasBase::scratchImageHeight(void) { size_t height; clGetImageInfo(reinterpret_cast(imageA_), CL_IMAGE_HEIGHT, sizeof(height), &height, NULL); return height; } cl_ulong BlasBase::maxMemAllocSize(void) { cl_int err; cl_ulong rc = 0; err = clGetDeviceInfo(primaryDevice_, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(rc), &rc, NULL); if ((err == CL_SUCCESS) && (additionalDevice_ != NULL)) { cl_ulong u; err = clGetDeviceInfo(additionalDevice_, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(u), &u, NULL); if (err == CL_SUCCESS) { rc = std::min(rc, u); } } return rc; } cl_ulong BlasBase::availGlobalMemSize(int primAdd) { cl_ulong gmemSize; cl_device_id dev; dev = (primAdd) ? 
additionalDevice_ : primaryDevice_; clGetDeviceInfo(dev, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmemSize), &gmemSize, NULL); return gmemSize; } void BlasBase::printDevInfoStr(cl_device_info param, const char *paramName, int primAdd) { char buf[4096]; cl_device_id dev; dev = (primAdd) ? additionalDevice_ : primaryDevice_; if (clGetDeviceInfo(dev, param, sizeof(buf), buf, NULL) == CL_SUCCESS) { std::cout << paramName << ": " << buf << std::endl; } } void BlasBase::printEnvInfo(void) { cl_ulong memSize; int i; if (primaryDevice_ == NULL) { return; } cl_uint libMajor, libMinor, libPatch; clblasGetVersion( &libMajor, &libMinor, &libPatch ); std::cout << std::endl << "Test environment:" << std::endl << std::endl; for (i = 0; i < 2; i++) { if (additionalDevice_ != NULL) { if (!i) { std::cout << "PRIMARY DEVICE (used in all cases):" << std::endl; } else { std::cout << "ADDITIONAL DEVICE (used only in cases with " "multiple command queues to cover cases with " "problem distribution among command queues " "belonging to different devices):" << std::endl; } } else if (i) { break; } printDevInfoStr(CL_DEVICE_NAME, "Device name", i); printDevInfoStr(CL_DEVICE_VENDOR, "Device vendor", i); std::cout << "Platform (bit): "; #if defined( _WIN32 ) std::cout << "Windows "; #if defined( _WIN64 ) std::cout << "(x64)" << std::endl; #else std::cout << "(x32)" << std::endl; #endif #elif defined( __APPLE__ ) std::cout << "Apple OS X" << std::endl; #else std::cout << "Linux" << std::endl; #endif std::cout << "clblas version: " << libMajor << "." << libMinor << "." << libPatch << std::endl; printDevInfoStr(CL_DRIVER_VERSION, "Driver version", i); printDevInfoStr(CL_DEVICE_VERSION, "Device version", i); memSize = availGlobalMemSize(i); std::cout << "Global mem size: " << memSize / (1024 * 1024) << " MB" << std::endl; std::cout << "---------------------------------------------------------" << std::endl << std::endl; } } } // namespace clblas-2.10/src/tests/CMakeLists.txt000066400000000000000000000402441264277366700174150ustar00rootroot00000000000000# ######################################################################## # Copyright 2013 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
# ######################################################################## set(SRC_COMMON cmdline.c common.cpp clBLAS-wrapper.cpp BlasBase.cpp ) set(SRC_COMMON_TIMER timer.c ) # group of sources with reference implementation stuff set (SRC_COMMON_REFIMPL blas.c blas-cblas.c blas-wrapper.cpp ) set(SRC_CORR correctness/blas-lapack.c correctness/BlasBase-corr.cpp correctness/corr-gemm.cpp correctness/corr-trmm.cpp correctness/corr-trsm.cpp correctness/corr-gemv.cpp correctness/corr-symv.cpp correctness/corr-spmv.cpp correctness/corr-syr2k.cpp correctness/corr-syrk.cpp correctness/corr-trmv.cpp correctness/corr-tpmv.cpp correctness/corr-trsv.cpp correctness/corr-symm.cpp correctness/corr-gemm2.cpp correctness/corr-ger.cpp correctness/corr-gerc.cpp correctness/corr-her.cpp correctness/corr-her2.cpp correctness/corr-syr.cpp correctness/corr-spr.cpp correctness/corr-syr2.cpp correctness/corr-hemv.cpp correctness/corr-hpmv.cpp correctness/corr-hemm.cpp correctness/corr-herk.cpp correctness/corr-tpsv.cpp correctness/corr-hpr.cpp correctness/corr-hpr2.cpp correctness/corr-spr2.cpp correctness/corr-gbmv.cpp correctness/corr-hbmv.cpp correctness/corr-tbmv.cpp correctness/corr-tbsv.cpp correctness/corr-sbmv.cpp correctness/corr-her2k.cpp correctness/corr-scal.cpp correctness/corr-swap.cpp correctness/corr-copy.cpp correctness/corr-axpy.cpp correctness/corr-dot.cpp correctness/corr-dotc.cpp correctness/corr-rotg.cpp correctness/corr-rotm.cpp correctness/corr-rot.cpp correctness/corr-rotmg.cpp correctness/corr-nrm2.cpp correctness/corr-asum.cpp correctness/corr-iamax.cpp correctness/test-correctness.cpp correctness/tcase-filter.cpp ) set(SRC_PERF performance/PerformanceRecorder.cpp performance/PerformanceTest.cpp performance/TrxmPerformanceTest.cpp performance/BlasBase-perf.cpp performance/perf-gemm.cpp performance/perf-gemm2.cpp performance/perf-gemv.cpp performance/perf-syr2k.cpp performance/perf-syrk.cpp performance/perf-symv.cpp performance/perf-spmv.cpp performance/perf-trmm.cpp performance/perf-trsm.cpp performance/perf-trmv.cpp performance/perf-tpmv.cpp performance/perf-trsv.cpp performance/perf-symm.cpp performance/perf-ger.cpp performance/perf-gerc.cpp performance/perf-syr.cpp performance/perf-spr.cpp performance/perf-her.cpp performance/perf-her2.cpp performance/perf-syr2.cpp performance/perf-hemm.cpp performance/perf-hemv.cpp performance/perf-hpmv.cpp performance/perf-herk.cpp performance/perf-tpsv.cpp performance/perf-hpr.cpp performance/perf-hpr2.cpp performance/perf-spr2.cpp performance/perf-sbmv.cpp performance/perf-gbmv.cpp performance/perf-hbmv.cpp performance/perf-tbmv.cpp performance/perf-tbsv.cpp performance/perf-her2k.cpp performance/perf-scal.cpp performance/perf-swap.cpp performance/perf-copy.cpp performance/perf-axpy.cpp performance/perf-dot.cpp performance/perf-dotc.cpp performance/perf-rotg.cpp performance/perf-rotm.cpp performance/perf-rot.cpp performance/perf-rotmg.cpp performance/perf-nrm2.cpp performance/perf-asum.cpp performance/perf-iamax.cpp performance/test-performance.cpp ) set(SRC_FUNC functional/func-error.cpp functional/func-event.cpp functional/func-thread.cpp functional/func-queue.cpp #functional/func-images.cpp functional/test-functional.cpp functional/BlasBase-func.cpp ) set(TESTS_HEADERS ${clBLAS_SOURCE_DIR}/clBLAS.h ${clBLAS_SOURCE_DIR}/clBLAS-complex.h ${clBLAS_SOURCE_DIR}/include/cltypes.h ${clBLAS_SOURCE_DIR}/include/defbool.h include/blas-internal.h include/blas-cblas.h include/blas-wrapper.h include/clBLAS-wrapper.h include/cmdline.h include/BlasBase.h 
include/common.h include/BlasBase.h include/gemm.h include/trmm.h include/tpmv.h include/trsm.h include/gemv.h include/symv.h include/spmv.h include/syr2k.h include/syrk.h include/trmv.h include/trsv.h include/symm.h include/ger.h include/gerc.h include/syr.h include/spr.h include/her.h include/her2.h include/syr2.h include/hemv.h include/hpmv.h include/hemm.h include/herk.h include/tpsv.h include/hpr.h include/hpr2.h include/spr2.h include/gbmv.h include/hbmv.h include/tbmv.h include/tbsv.h include/copy.h include/sbmv.h include/dot.h include/dotc.h include/her2k.h include/scal.h include/swap.h include/axpy.h include/rotg.h include/rotm.h include/rot.h include/asum.h include/rotmg.h include/nrm2.h include/iamax.h include/blas-math.h include/blas-random.h include/matrix.h include/timer.h ) set(CORR_HEADERS correctness/blas-lapack.h correctness/trsm-delta.h correctness/tcase-filter.h correctness/delta.h correctness/trsv-delta.h ) set(PERF_HEADERS performance/PerformanceTest.h performance/PerformanceRecorder.h ) set(FUNC_HEADERS functional/func.h ) # Setup Visual Studio file tabs source_group(correctness FILES ${SRC_CORR} ${CORR_HEADERS}) source_group(performance FILES ${SRC_PERF} ${PERF_HEADERS}) source_group(functional FILES ${SRC_FUNC} ${FUNC_HEADERS}) # FIXME: it's a temporary solution to workaround segfault in clGetProgramInfo() # at paramVal = CL_PROGRAM_BINARIES and several devices in the context add_definitions( -DTEST_WITH_SINGLE_DEVICE ) # vs11 needs std::tuples compiled with 10 parameters by default # NOTE: this assumes that googletest is compiled with the same preprocessor macro; they must match if( MSVC11 ) add_definitions( "/D_VARIADIC_MAX=10" ) endif() # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long # http://code.google.com/p/googletest/issues/detail?id=334 if( CMAKE_COMPILER_IS_GNUCXX ) add_definitions( -Wno-long-long ) endif( ) if( CMAKE_Fortran_COMPILER_ID STREQUAL "PGI" ) message( STATUS "Detected PGI Fortran compiler." ) # By default, -Mipa=fast is used, and this does not mix well with the cl compiler string( REPLACE "-Mipa=fast" "" CMAKE_Fortran_FLAGS_RELEASE ${CMAKE_Fortran_FLAGS_RELEASE} ) # In windows, dynamically link to the C runtime, and tell fortran linker to not include default main subroutine if( WIN32 ) set( CMAKE_EXE_LINKER_FLAGS "-Bdynamic -Mnostartup ${CMAKE_EXE_LINKER_FLAGS}" ) endif( ) endif( ) # Library with functions for time measurement. 
In Windows they are included automatically if(UNIX) if(NOT APPLE) set(TIME_LIBRARY "rt") endif() set(THREAD_LIBRARY "pthread") endif() # This logic supports the build server, if it compiles the runtime seperately from the test programs # It stitches together a path to a previously built static library, based on our 'make install' logic # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS ) set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib${SUFFIX_LIB}" ) if( WIN32 ) set( runtime.library "${runtime.library}/import/clBLAS${CMAKE_STATIC_LIBRARY_SUFFIX}" ) else( ) set( runtime.library "${runtime.library}/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX}" ) endif( ) # Find Google Test package include(gtest.cmake) if( GTEST_FOUND ) if( CORR_TEST_WITH_ACML AND ACML_FOUND ) include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${ACML_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) add_definitions(-DCORR_TEST_WITH_ACML) add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties(test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS) set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS) set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with # gcc > 4.3.2 to support ACML. 
# If the runtime is being built by the project, use it, otherwise link to a runtime library specified in the install prefix if( BUILD_RUNTIME ) target_link_libraries(test-correctness ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-medium ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-short ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) else( ) target_link_libraries(test-correctness ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${THREAD_LIBRARY} ${runtime.library}) target_link_libraries(test-medium ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${THREAD_LIBRARY} ${runtime.library}) target_link_libraries(test-short ${ACML_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${THREAD_LIBRARY} ${runtime.library}) endif( ) else( ) #Link against the netlib reference library include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include}) add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS ) set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL} ${CORR_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS ) set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE) set_target_properties( test-correctness PROPERTIES LINKER_LANGUAGE Fortran ) set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran ) set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran ) endif( ) if( BUILD_RUNTIME ) if( NETLIB_FOUND ) target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) else( ) target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${THREAD_LIBRARY} clBLAS) endif( ) else( ) if( NETLIB_FOUND ) target_link_libraries(test-correctness ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) else( ) target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} ) endif( ) endif( ) endif( ) 
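# Note: with the path stitched together above, runtime.library is expected to resolve to
# "${CMAKE_INSTALL_PREFIX}/lib${SUFFIX_LIB}/libclBLAS.so" on Linux (via CMAKE_SHARED_LIBRARY_PREFIX/SUFFIX,
# so the name differs on other platforms) and to "${CMAKE_INSTALL_PREFIX}/lib${SUFFIX_LIB}/import/clBLAS.lib"
# on Windows, where SUFFIX_LIB is assumed to be defined by the top-level CMakeLists (e.g. "64" for a
# 64-bit install). The test targets above link against it only when BUILD_RUNTIME is not set, i.e. when
# the tests are built against a previously installed library rather than the in-tree clBLAS target.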
set_property( TARGET test-correctness PROPERTY FOLDER "Test") set_property( TARGET test-medium PROPERTY FOLDER "Test") set_property( TARGET test-short PROPERTY FOLDER "Test") # CPack configuration; include the executable into the package install( TARGETS test-correctness test-medium test-short RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) get_target_property( testLocation test-correctness LOCATION ) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/copyTestDependencies.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake" @ONLY ) # Register script at run at install time to analyze the executable and copy dependencies into package install( SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake") if( ACML_FOUND ) include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include) add_definitions(-DPERF_TEST_WITH_ACML) include_directories(${ACML_INCLUDE_DIRS}) add_executable(test-performance ${SRC_PERF} ${SRC_COMMON} ${SRC_COMMON_TIMER} ${PERF_HEADERS} ${TESTS_HEADERS} ${SRC_COMMON_REFIMPL}) target_link_libraries(test-performance ${ACML_LIBRARIES} ${THREAD_LIBRARY}) set_target_properties( test-performance PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) if( BUILD_RUNTIME ) target_link_libraries(test-performance ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS) else() target_link_libraries( test-performance ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} ) endif() set_property( TARGET test-performance PROPERTY FOLDER "Test") # CPack configuration; include the executable into the package install( TARGETS test-performance RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) endif() include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include ${clBLAS_SOURCE_DIR}/include ) add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER} ${FUNC_HEADERS} ${TESTS_HEADERS}) set_target_properties( test-functional PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" ) if( BUILD_RUNTIME ) target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS ) else() target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} ) endif() set_property( TARGET test-functional PROPERTY FOLDER "Test") # CPack configuration; include the executable into the package install( TARGETS test-functional RUNTIME DESTINATION bin${SUFFIX_BIN} LIBRARY DESTINATION lib${SUFFIX_LIB} ARCHIVE DESTINATION lib${SUFFIX_LIB}/import ) endif() clblas-2.10/src/tests/blas-cblas.c000066400000000000000000000023011264277366700170140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include complex compose_complex(float x, float y) { complex z = { x, y }; return z; } float complex_real(complex z) { return z.real; } float complex_imag(complex z) { return z.imag; } doublecomplex compose_doublecomplex(double x, double y) { doublecomplex z = { x, y }; return z; } double doublecomplex_real(doublecomplex z) { return z.real; } double doublecomplex_imag(doublecomplex z) { return z.imag; } clblas-2.10/src/tests/blas-wrapper.cpp000066400000000000000000001362301264277366700177610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include void ::clMath::blas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy) { blasSgemv(order, transA, M, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy) { blasDgemv(order, transA, M, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *X, int incx, FloatComplex beta, FloatComplex *Y, int incy) { blasCgemv(order, transA, M, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *X, int incx, DoubleComplex beta, DoubleComplex *Y, int incy) { blasZgemv(order, transA, M, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::symv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy) { blasSsymv(order, uplo, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::symv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy) { blasDsymv(order, uplo, N, alpha, A, lda, X, incx, beta, Y, incy); } void ::clMath::blas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc) { blasSgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const 
double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc) { blasDgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc) { blasCgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc) { blasZgemm(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb) { blasStrmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb) { blasDtrmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb) { blasCtrmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb) { blasZtrmm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb) { blasStrsm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb) { blasDtrsm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb) { blasCtrsm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb) { blasZtrsm(order, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); } void ::clMath::blas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc) { blasSsyr2k(order, uplo, transA, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::syr2k( clblasOrder order, 
clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc) { blasDsyr2k(order, uplo, transA, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc) { blasCsyr2k(order, uplo, transA, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc) { blasZsyr2k(order, uplo, transA, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } void ::clMath::blas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, float beta, float *C, size_t ldc) { blasSsyrk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, double beta, double *C, size_t ldc) { blasDsyrk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex beta, FloatComplex *C, size_t ldc) { blasCsyrk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex beta, DoubleComplex *C, size_t ldc) { blasZsyrk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { blasStrmv( order, uplo, transA, diag, N, A, offa, lda, X, offx, incx ); } void ::clMath::blas::trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { blasDtrmv( order, uplo, transA, diag, N, A, offa, lda, X, offx, incx ); } void ::clMath::blas::trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { blasCtrmv( order, uplo, transA, diag, N, A, offa, lda, X, offx, incx ); } void ::clMath::blas::trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { blasZtrmv( order, uplo, transA, diag, N, A, offa, lda, X, offx, incx ); } //TPMV void ::clMath::blas::tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *AP, size_t offa, float *X, size_t offx, int incx) { blasStpmv( order, uplo, transA, diag, N, AP, offa, X, offx, incx ); } void ::clMath::blas::tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *AP, size_t offa, double *X, size_t offx, int incx) { blasDtpmv( order, uplo, transA, diag, N, AP, offa, X, offx, incx ); } void 
::clMath::blas::tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *AP, size_t offa, FloatComplex *X, size_t offx, int incx) { blasCtpmv( order, uplo, transA, diag, N, AP, offa, X, offx, incx ); } void ::clMath::blas::tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *AP, size_t offa, DoubleComplex *X, size_t offx, int incx) { blasZtpmv( order, uplo, transA, diag, N, AP, offa, X, offx, incx ); } void ::clMath::blas::trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { blasStrsv( order, uplo, transA, diag, N, A,offa, lda, X,offx, incx ); } void ::clMath::blas::trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { blasDtrsv( order, uplo, transA, diag, N, A,offa, lda, X,offx, incx ); } void ::clMath::blas::trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { blasCtrsv( order, uplo, transA, diag, N, A,offa, lda, X,offx, incx ); } void ::clMath::blas::trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { blasZtrsv( order, uplo, transA, diag, N, A,offa, lda, X,offx, incx ); } void ::clMath::blas::tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, float *X, size_t offx, int incx) { blasStpsv( order, uplo, transA, diag, N, A, offa, X, offx, incx ); } void ::clMath::blas::tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, double *X, size_t offx, int incx) { blasDtpsv( order, uplo, transA, diag, N, A, offa, X, offx, incx ); } void ::clMath::blas::tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, FloatComplex *X, size_t offx, int incx) { blasCtpsv( order, uplo, transA, diag, N, A, offa, X, offx, incx ); } void ::clMath::blas::tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, DoubleComplex *X, size_t offx, int incx) { blasZtpsv( order, uplo, transA, diag, N, A, offa, X, offx, incx ); } void ::clMath::blas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, float* A, size_t offa, size_t lda, float* B, size_t offb, size_t ldb, float beta, float* C, size_t offc, size_t ldc) { blasSsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc ); } void ::clMath::blas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, double* A, size_t offa, size_t lda, double* B, size_t offb, size_t ldb, double beta, double* C, size_t offc, size_t ldc) { blasDsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc ); } void ::clMath::blas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc) { blasCsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, 
ldb, beta, C, offc, ldc ); } void ::clMath::blas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc) { blasZsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc ); } void ::clMath::blas::ger( clblasOrder order, size_t M, size_t N, float alpha, float *x, size_t offx, int incx, float *y, size_t offy, int incy, float *A, size_t offa, size_t lda) { blasSger( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::ger( clblasOrder order, size_t M, size_t N, double alpha, double *x, size_t offx, int incx, double *y, size_t offy, int incy, double *A, size_t offa, size_t lda) { blasDger( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::ger( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex *x, size_t offx, int incx, FloatComplex *y, size_t offy, int incy, FloatComplex *A, size_t offa, size_t lda) { blasCgeru( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::ger( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex *x, size_t offx, int incx, DoubleComplex *y, size_t offy, int incy, DoubleComplex *A, size_t offa, size_t lda) { blasZgeru( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::gerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex *x, size_t offx, int incx, FloatComplex *y, size_t offy, int incy, FloatComplex *A, size_t offa, size_t lda) { blasCgerc( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::gerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex *x, size_t offx, int incx, DoubleComplex *y, size_t offy, int incy, DoubleComplex *A, size_t offa, size_t lda) { blasZgerc( order, M, N, alpha, x, offx, incx, y, offy, incy , A, offa, lda ); } void ::clMath::blas::syr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* A, size_t offa, size_t lda) { blasSsyr(order, uplo, N, alpha, X, offx, incx, A, offa, lda); } void ::clMath::blas::syr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* A, size_t offa, size_t lda) { blasDsyr(order, uplo, N, alpha, X, offx, incx, A, offa, lda); } //SPR void ::clMath::blas::spr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* AP, size_t offa) { blasSspr(order, uplo, N, alpha, X, offx, incx, AP, offa); } void ::clMath::blas::spr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* AP, size_t offa) { blasDspr(order, uplo, N, alpha, X, offx, incx, AP, offa); } void ::clMath::blas::her( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex *x, size_t offx, int incx, FloatComplex *A, size_t offa, size_t lda) { blasCher( order, uplo, N, alpha, x, offx, incx, A, offa, lda ); } void ::clMath::blas::her( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex *x, size_t offx, int incx, DoubleComplex *A, size_t offa, size_t lda) { blasZher( order, uplo, N, alpha, x, offx, incx, A, offa, lda ); } void ::clMath::blas::syr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t 
offy, int incy, float* A, size_t offa, size_t lda) { blasSsyr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda); } void ::clMath::blas::syr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* A, size_t offa, size_t lda) { blasDsyr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda); } //HER2 void ::clMath::blas::her2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda) { blasCher2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda); } void ::clMath::blas::her2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda) { blasZher2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda); } void ::clMath::blas::hemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy) { blasChemv(order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy); } void ::clMath::blas::hemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy) { blasZhemv(order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy); } //HEMM void ::clMath::blas::hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc) { blasChemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc ); } void ::clMath::blas::hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc) { blasZhemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc ); } void ::clMath::blas::herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const FloatComplex *A, size_t lda, float beta, FloatComplex *C, size_t ldc) { blasCherk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const DoubleComplex *A, size_t lda, double beta, DoubleComplex *C, size_t ldc) { blasZherk(order, uplo, transA, N, K, alpha, A, lda, beta, C, ldc); } void ::clMath::blas::spmv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t offa, const float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { blasSspmv(order, uplo, N, alpha, A, offa, X, offx, incx, beta, Y, offy, incy); } void ::clMath::blas::spmv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t offa, const double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { blasDspmv(order, uplo, N, alpha, A, offa, X, offx, incx, beta, Y, offy, incy); } void ::clMath::blas::hpmv( clblasOrder order, 
clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy) { blasChpmv(order, uplo, N, alpha, A, offa, X, offx, incx, beta, Y, offy, incy); } void ::clMath::blas::hpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy) { blasZhpmv(order, uplo, N, alpha, A, offa, X, offx, incx, beta, Y, offy, incy); } void ::clMath::blas::hpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex *x, size_t offx, int incx, FloatComplex *AP, size_t offa) { blasChpr( order, uplo, N, alpha, x, offx, incx, AP, offa); } void ::clMath::blas::hpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex *x, size_t offx, int incx, DoubleComplex *AP, size_t offa) { blasZhpr( order, uplo, N, alpha, x, offx, incx, AP, offa ); } void ::clMath::blas::spr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* AP, size_t offa) { blasSspr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa); } void ::clMath::blas::spr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* AP, size_t offa) { blasDspr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa); } void ::clMath::blas::hpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* AP, size_t offa) { blasChpr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa); } void ::clMath::blas::hpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* AP, size_t offa) { blasZhpr2(order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa); } void clMath::blas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { return blasSgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } void clMath::blas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { return blasDgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy); } void clMath::blas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy) { return blasCgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy); } void clMath::blas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy) { return blasZgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } //TBMV 
void clMath::blas::tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { return blasStbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { return blasDtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { return blasCtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { return blasZtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } //SBMV void clMath::blas::sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { return blasSsbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } void clMath::blas::sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { return blasDsbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } //HBMV void clMath::blas::hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy) { return blasChbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } void clMath::blas::hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy) { return blasZhbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy ); } //TBSV void clMath::blas::tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { return blasStbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { return blasDtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { return blasCtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx ); } void clMath::blas::tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { return blasZtbsv( order, uplo, trans, diag, N, K, A, 
offa, lda, X, offx, incx ); } void ::clMath::blas::her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t offa, size_t lda, const FloatComplex *B, size_t offb, size_t ldb, float beta, FloatComplex *C, size_t offc, size_t ldc) { blasCher2k(order, uplo, transA, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc); } void ::clMath::blas::her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t offa, size_t lda, const DoubleComplex *B, size_t offb, size_t ldb, double beta, DoubleComplex *C, size_t offc, size_t ldc) { blasZher2k(order, uplo, transA, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc); } //copy void ::clMath::blas::copy( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy) { return blasScopy( N, X, offx, incx, Y, offy, incy ); } void ::clMath::blas::copy( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy) { return blasDcopy( N, X, offx, incx, Y, offy, incy ); } void ::clMath::blas::copy( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { return blasCcopy( N, X, offx, incx, Y, offy, incy ); } void ::clMath::blas::copy( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { return blasZcopy( N, X, offx, incx, Y, offy, incy ); } //swap void clMath::blas::swap( size_t N, float *X, size_t offBX, int incx, float *Y, size_t offCY, int incy) { return blasSswap( N, X, offBX, incx, Y, offCY, incy ); } void clMath::blas::swap( size_t N, double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy) { return blasDswap( N, X, offBX, incx, Y, offCY, incy ); } void clMath::blas::swap( size_t N, FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy) { return blasCswap( N, X, offBX, incx, Y, offCY, incy ); } void clMath::blas::swap( size_t N, DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy) { return blasZswap( N, X, offBX, incx, Y, offCY, incy ); } void ::clMath::blas::scal( bool is_css_zds, size_t N, float alpha, float *X, size_t offx, int incx) { is_css_zds = is_css_zds; return blasSscal(N, alpha, X, offx, incx); } void ::clMath::blas::scal( bool is_css_zds, size_t N, double alpha, double *X, size_t offx, int incx) { is_css_zds = is_css_zds; // Remove warning return blasDscal(N, alpha, X, offx, incx); } void ::clMath::blas::scal( bool is_css_zds, size_t N, FloatComplex alpha, FloatComplex *X, size_t offx, int incx) { if(is_css_zds) { return blasCsscal(N, CREAL(alpha), X, offx, incx); } else { return blasCscal(N, alpha, X, offx, incx); } } void ::clMath::blas::scal( bool is_css_zds, size_t N, DoubleComplex alpha, DoubleComplex *X, size_t offx, int incx) { if(is_css_zds) { return blasZdscal(N, CREAL(alpha), X, offx, incx); } else { return blasZscal(N, alpha, X, offx, incx); } } //DOT float clMath::blas::dot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy) { return blasSdot( N, X, offx, incx, Y, offy, incy ); } double clMath::blas::dot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy) { return blasDdot( N, X, offx, incx, Y, offy, incy ); } FloatComplex clMath::blas::dot( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { return blasCdotu( N, X, offx, incx, Y, offy, incy ); } DoubleComplex clMath::blas::dot( size_t N, 
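/* Note: the complex dot() overloads map to the unconjugated products
 * blasCdotu/blasZdotu; the conjugated variants are exposed separately
 * through the dotc() overloads further below. */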
DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { return blasZdotu( N, X, offx, incx, Y, offy, incy ); } //ASUM float clMath::blas::asum( size_t N, float *X, size_t offx, int incx) { return blasSasum( N, X, offx, incx); } double clMath::blas::asum( size_t N, double *X, size_t offx, int incx) { return blasDasum( N, X, offx, incx); } float clMath::blas::asum( size_t N, FloatComplex *X, size_t offx, int incx) { return blasScasum( N, X, offx, incx); } double clMath::blas::asum( size_t N, DoubleComplex *X, size_t offx, int incx) { return blasDzasum( N, X, offx, incx); } //DOTC FloatComplex clMath::blas::dotc( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { return blasCdotc( N, X, offx, incx, Y, offy, incy ); } DoubleComplex clMath::blas::dotc( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { return blasZdotc( N, X, offx, incx, Y, offy, incy ); } //axpy calls void clMath::blas::axpy( size_t N, float alpha, const float * X, size_t offBX, int incx, float *Y, size_t offCY, int incy) { return blasSaxpy(N, alpha, X, offBX, incx, Y, offCY, incy); } void clMath::blas::axpy( size_t N, double alpha, const double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy) { return blasDaxpy(N, alpha, X, offBX, incx, Y, offCY, incy); } void clMath::blas::axpy( size_t N, FloatComplex alpha, const FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy) { return blasCaxpy(N, alpha, X, offBX, incx, Y, offCY, incy); } void clMath::blas::axpy( size_t N, DoubleComplex alpha, const DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy) { return blasZaxpy(N, alpha, X, offBX, incx, Y, offCY, incy); } void clMath::blas::rotg( float* SA, size_t offSA, float* SB, size_t offSB, float* C, size_t offC, float* S, size_t offS) { return blasSrotg(SA, offSA, SB, offSB, C, offC, S, offS); } void clMath::blas::rotg( double* SA, size_t offSA, double* SB, size_t offSB, double* C, size_t offC, double* S, size_t offS) { return blasDrotg(SA, offSA, SB, offSB, C, offC, S, offS); } void clMath::blas::rotg( FloatComplex* SA, size_t offSA, FloatComplex* SB, size_t offSB, float* C, size_t offC, FloatComplex* S, size_t offS) { return blasCrotg(SA, offSA, SB, offSB, C, offC, S, offS); } void clMath::blas::rotg( DoubleComplex* SA, size_t offSA, DoubleComplex* SB, size_t offSB, double* C, size_t offC, DoubleComplex* S, size_t offS) { return blasZrotg(SA, offSA, SB, offSB, C, offC, S, offS); } void clMath::blas::rotmg( float *D1, size_t offD1, float *D2, size_t offD2, float *X1, size_t offX1, const float *Y1, size_t offY1, float *PARAM, size_t offParam) { return blasSrotmg(D1, offD1, D2, offD2, X1, offX1, Y1, offY1, PARAM, offParam); } void clMath::blas::rotmg( double *D1, size_t offD1, double *D2, size_t offD2, double *X1, size_t offX1, const double *Y1, size_t offY1, double *PARAM, size_t offParam) { return blasDrotmg(D1, offD1, D2, offD2, X1, offX1, Y1, offY1, PARAM, offParam); } void clMath::blas::rotm( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy, float *PARAM, size_t offParam) { return blasSrotm(N, X, offx, incx, Y, offy, incy, PARAM, offParam); } void clMath::blas::rotm( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double *PARAM, size_t offParam) { return blasDrotm(N, X, offx, incx, Y, offy, incy, PARAM, offParam); } //rot void clMath::blas::rot( size_t N, float *X, size_t offx, int incx, float 
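/* For the complex rot() overloads the reference calls are csrot/zdrot, which
 * take real rotation parameters, so only CREAL(C) and CREAL(S) are passed on. */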
*Y, size_t offy, int incy, float C, float S) { return blasSrot(N, X, offx, incx, Y, offy, incy, C, S); } void clMath::blas::rot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double C, double S) { return blasDrot(N, X, offx, incx, Y, offy, incy, C, S); } void clMath::blas::rot( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy, FloatComplex C, FloatComplex S) { return blasCsrot(N, X, offx, incx, Y, offy, incy, CREAL(C), CREAL(S)); } void clMath::blas::rot( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy, DoubleComplex C, DoubleComplex S) { return blasZdrot(N, X, offx, incx, Y, offy, incy, CREAL(C), CREAL(S)); } int clMath::blas::iamax( size_t N, float *X, size_t offx, int incx) { return blasiSamax( N, X, offx, incx ); } int clMath::blas::iamax( size_t N, double *X, size_t offx, int incx) { return blasiDamax( N, X, offx, incx ); } int clMath::blas::iamax( size_t N, FloatComplex *X, size_t offx, int incx) { return blasiCamax( N, X, offx, incx ); } int clMath::blas::iamax( size_t N, DoubleComplex *X, size_t offx, int incx) { return blasiZamax( N, X, offx, incx ); } float clMath::blas::nrm2( size_t N, float *X, size_t offx, int incx) { return blasSnrm2( N, X, offx, incx ); } double clMath::blas::nrm2( size_t N, double *X, size_t offx, int incx) { return blasDnrm2( N, X, offx, incx ); } float clMath::blas::nrm2( size_t N, FloatComplex *X, size_t offx, int incx) { return blasScnrm2( N, X, offx, incx ); } double clMath::blas::nrm2( size_t N, DoubleComplex *X, size_t offx, int incx) { return blasDznrm2( N, X, offx, incx ); } clblas-2.10/src/tests/blas.c000066400000000000000000003252131264277366700157440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include /* abort() */ #include /* fprintf(), stderr */ #include #include #include #if defined CORR_TEST_WITH_ACML #include #else #include #endif void blasSgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy) { char fTransA; int fM, fN; int fLDA; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fM = (int)M; fN = (int)N; fLDA = (int)lda; sgemv(fTransA, fM, fN, alpha, (float*)A, fLDA, (float*)X, incx, beta, Y, incy); } void blasDgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy) { char fTransA; int fM, fN; int fLDA; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fM = (int)M; fN = (int)N; fLDA = (int)lda; dgemv(fTransA, fM, fN, alpha, (double*)A, fLDA, (double*)X, incx, beta, Y, incy); } void blasCgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *X, int incx, FloatComplex beta, FloatComplex *Y, int incy) { char fTransA; int fM, fN; int fLDA; complex *fA, *fX, *fY; complex fAlpha, fBeta; #if 0 size_t sizeA, sizeX, sizeY; size_t i; sizeA = lda * N; //column major if (transA == clblasNoTrans) { sizeX = (N - 1) * abs(incx) + 1; sizeY = (M - 1) * abs(incy) + 1; } else { sizeX = (M - 1) * abs(incx) + 1; sizeY = (N - 1) * abs(incy) + 1; } #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fM = (int)M; fN = (int)N; fLDA = (int)lda; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); #if 0 fA = (complex*)calloc(sizeA, sizeof(complex)); if (fA == NULL) { return; } fX = (complex*)calloc(sizeX, sizeof(complex)); if (fX == NULL) { free(fA); return; } fY = (complex*)calloc(sizeY, sizeof(complex)); if (fY == NULL) { free(fX); free(fA); return; } for (i = 0; i < sizeA; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < sizeX; i++) { fX[i] = compose_complex(CREAL(X[i]), CIMAG(X[i])); } for (i = 0; i < sizeY; i++) { fY[i] = compose_complex(CREAL(Y[i]), CIMAG(Y[i])); } #else fA = (complex*)A; fX = (complex*)X; fY = (complex*)Y; #endif cgemv(fTransA, fM, fN, &fAlpha, fA, fLDA, fX, incx, &fBeta, fY, incy); #if 0 for (i = 0; i < sizeY; i++) { Y[i] = floatComplex(complex_real(fY[i]), complex_imag(fY[i])); } free(fY); free(fX); free(fA); #endif } void blasZgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *X, int incx, DoubleComplex beta, DoubleComplex *Y, int incy) { char fTransA; int fM, fN; int fLDA; doublecomplex *fA, *fX, *fY; doublecomplex fAlpha, fBeta; #if 0 size_t sizeA, sizeX, sizeY; size_t i; sizeA = lda * N; //column major if (transA == clblasNoTrans) { sizeX = (N - 1) * abs(incx) + 1; sizeY = (M - 1) * abs(incy) + 1; } else { sizeX = (M - 1) * abs(incx) + 1; sizeY = (N - 1) * abs(incy) + 1; } #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = 
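/* Common pattern for every blasX wrapper in this file: reject anything but
 * clblasColumnMajor (the reference routines are column-major only), encode the
 * clblas enums into LAPACK-style character codes, narrow the size_t dimensions
 * and leading dimensions to int, then call the reference routine. The #if 0
 * blocks are a disabled deep-copy conversion path; the active path simply
 * reinterprets the buffers as the reference library's complex types. */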
encodeTranspose(transA); fM = (int)M; fN = (int)N; fLDA = (int)lda; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); #if 0 fA = (doublecomplex*)calloc(sizeA, sizeof(doublecomplex)); if (fA == NULL) { return; } fX = (doublecomplex*)calloc(sizeX, sizeof(doublecomplex)); if (fX == NULL) { free(fA); return; } fY = (doublecomplex*)calloc(sizeY, sizeof(doublecomplex)); if (fY == NULL) { free(fX); free(fA); return; } for (i = 0; i < sizeA; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < sizeX; i++) { fX[i] = compose_doublecomplex(CREAL(X[i]), CIMAG(X[i])); } for (i = 0; i < sizeY; i++) { fY[i] = compose_doublecomplex(CREAL(Y[i]), CIMAG(Y[i])); } #else fA = (doublecomplex*)A; fX = (doublecomplex*)X; fY = (doublecomplex*)Y; #endif zgemv(fTransA, fM, fN, &fAlpha, fA, fLDA, fX, incx, &fBeta, fY, incy); #if 0 for (i = 0; i < sizeY; i++) { Y[i] = doubleComplex( doublecomplex_real(fY[i]), doublecomplex_imag(fY[i])); } free(fY); free(fX); free(fA); #endif } void blasSsymv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy) { char fUplo; int fN; int fLDA; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fN = (int)N; fLDA = (int)lda; ssymv(fUplo, fN, alpha, (float*)A, fLDA, (float*)X, incx, beta, Y, incy); } void blasDsymv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy) { char fUplo; int fN; int fLDA; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fN = (int)N; fLDA = (int)lda; dsymv(fUplo, fN, alpha, (double*)A, fLDA, (double*)X, incx, beta, Y, incy); } void blasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc) { char fTransA, fTransB; int fM, fN, fK; int fLDA, fLDB, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fTransB = encodeTranspose(transB); fM = (int)M; fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; sgemm(fTransA, fTransB, fM, fN, fK, alpha, (float*)A, fLDA, (float*)B, fLDB, beta, C, fLDC); } void blasDgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc) { char fTransA, fTransB; int fM, fN, fK; int fLDA, fLDB, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fTransB = encodeTranspose(transB); fM = (int)M; fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; dgemm(fTransA, fTransB, fM, fN, fK, alpha, (double*)A, fLDA, (double*)B, fLDB, beta, C, fLDC); } void blasCgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc) { char fTransA, fTransB; int fM, fN, fK; int fLDA, 
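/* Complex GEMM wrappers: alpha and beta are repacked with compose_complex /
 * compose_doublecomplex and passed by address, since the reference interface
 * expects pointers to complex scalars. */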
fLDB, fLDC; complex *fA, *fB, *fC; complex fAlpha, fBeta; #if 0 size_t ma, ka, nb, kb, mc, nc; size_t i; if (transA == clblasNoTrans) { ma = lda; ka = K; } else { ka = lda; ma = M; } if (transB == clblasNoTrans) { kb = ldb; nb = N; } else { nb = ldb; kb = K; } mc = ldc; nc = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fTransB = encodeTranspose(transB); fM = (int)M; fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); #if 0 fA = (complex*)calloc(ma * ka, sizeof(complex)); if (fA == NULL) { return; } fB = (complex*)calloc(kb * nb, sizeof(complex)); if (fB == NULL) { free(fA); return; } fC = (complex*)calloc(mc * nc, sizeof(complex)); if (fC == NULL) { free(fB); free(fA); return; } for (i = 0; i < ma * ka; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < kb * nb; i++) { fB[i] = compose_complex(CREAL(B[i]), CIMAG(B[i])); } for (i = 0; i < mc * nc; i++) { fC[i] = compose_complex(CREAL(C[i]), CIMAG(C[i])); } #else fA = (complex*)A; fB = (complex*)B; fC = (complex*)C; #endif cgemm(fTransA, fTransB, fM, fN, fK, &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC); #if 0 for (i = 0; i < mc * nc; i++) { C[i] = floatComplex(complex_real(fC[i]), complex_imag(fC[i])); } free(fC); free(fB); free(fA); #endif } void blasZgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc) { char fTransA, fTransB; int fM, fN, fK; int fLDA, fLDB, fLDC; doublecomplex *fA, *fB, *fC; doublecomplex fAlpha, fBeta; #if 0 size_t ma, ka, nb, kb, mc, nc; size_t i; if (transA == clblasNoTrans) { ma = lda; ka = K; } else { ka = lda; ma = M; } if (transB == clblasNoTrans) { kb = ldb; nb = N; } else { nb = ldb; kb = K; } mc = ldc; nc = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fTransA = encodeTranspose(transA); fTransB = encodeTranspose(transB); fM = (int)M; fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); #if 0 fA = (doublecomplex*)calloc(ma * ka, sizeof(doublecomplex)); if (fA == NULL) { return; } fB = (doublecomplex*)calloc(kb * nb, sizeof(doublecomplex)); if (fB == NULL) { free(fA); return; } fC = (doublecomplex*)calloc(mc * nc, sizeof(doublecomplex)); if (fC == NULL) { free(fB); free(fA); return; } for (i = 0; i < ma * ka; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < kb * nb; i++) { fB[i] = compose_doublecomplex(CREAL(B[i]), CIMAG(B[i])); } for (i = 0; i < mc * nc; i++) { fC[i] = compose_doublecomplex(CREAL(C[i]), CIMAG(C[i])); } #else fA = (doublecomplex*)A; fB = (doublecomplex*)B; fC = (doublecomplex*)C; #endif zgemm(fTransA, fTransB, fM, fN, fK, &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC); #if 0 for (i = 0; i < mc * nc; i++) { C[i] = doubleComplex(doublecomplex_real(fC[i]), doublecomplex_imag(fC[i])); } free(fC); free(fB); free(fA); #endif } void blasStrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float 
*B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; strmm(fSide, fUplo, fTransA, fDiag, fM, fN, alpha, (float*)A, fLDA, B, fLDB); } void blasDtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; dtrmm(fSide, fUplo, fTransA, fDiag, fM, fN, alpha, (double*)A, fLDA, B, fLDB); } void blasCtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; complex *fA, *fB; complex fAlpha; #if 0 size_t ma, na, mb, nb; size_t i; ma = lda; if (side == clblasLeft) { na = M; } else { na = N; } mb = ldb; nb = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); #if 0 fA = (complex*)calloc(ma * na, sizeof(complex)); if (fA == NULL) { return; } fB = (complex*)calloc(mb * nb, sizeof(complex)); if (fB == NULL) { free(fA); return; } for (i = 0; i < ma * na; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < mb * nb; i++) { fB[i] = compose_complex(CREAL(B[i]), CIMAG(B[i])); } #else fA = (complex*)A; fB = (complex*)B; #endif ctrmm(fSide, fUplo, fTransA, fDiag, fM, fN, &fAlpha, fA, fLDA, fB, fLDB); #if 0 for (i = 0; i < mb * nb; i++) { B[i] = floatComplex(complex_real(fB[i]), complex_imag(fB[i])); } free(fB); free(fA); #endif } void blasZtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; doublecomplex *fA, *fB; doublecomplex fAlpha; #if 0 size_t ma, na, mb, nb; size_t i; ma = lda; if (side == clblasLeft) { na = M; } else { na = N; } mb = ldb; nb = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); #if 0 fA = (doublecomplex*)calloc(ma * na, sizeof(doublecomplex)); if (fA == NULL) { return; } fB = (doublecomplex*)calloc(mb * nb, sizeof(doublecomplex)); if (fB == NULL) { free(fA); return; } for (i = 0; i < ma * na; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 
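/* (disabled copy loop: when enabled it would convert B element-by-element into
 * doublecomplex storage before the call; the active path casts B directly) */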
0; i < mb * nb; i++) { fB[i] = compose_doublecomplex(CREAL(B[i]), CIMAG(B[i])); } #else fA = (doublecomplex*)A; fB = (doublecomplex*)B; #endif ztrmm(fSide, fUplo, fTransA, fDiag, fM, fN, &fAlpha, fA, fLDA, fB, fLDB); #if 0 for (i = 0; i < mb * nb; i++) { B[i] = doubleComplex(doublecomplex_real(fB[i]), doublecomplex_imag(fB[i])); } free(fB); free(fA); #endif } void blasStrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; strsm(fSide, fUplo, fTransA, fDiag, fM, fN, alpha, (float*)A, fLDA, B, fLDB); } void blasDtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; dtrsm(fSide, fUplo, fTransA, fDiag, fM, fN, alpha, (double*)A, fLDA, B, fLDB); } void blasCtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; complex *fA, *fB; complex fAlpha; #if 0 size_t ma, na, mb, nb; size_t i; ma = lda; if (side == clblasLeft) { na = M; } else { na = N; } mb = ldb; nb = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); #if 0 fA = (complex*)calloc(ma * na, sizeof(complex)); if (fA == NULL) { return; } fB = (complex*)calloc(mb * nb, sizeof(complex)); if (fB == NULL) { free(fA); return; } for (i = 0; i < ma * na; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < mb * nb; i++) { fB[i] = compose_complex(CREAL(B[i]), CIMAG(B[i])); } #else fA = (complex*)A; fB = (complex*)B; #endif ctrsm(fSide, fUplo, fTransA, fDiag, fM, fN, &fAlpha, fA, fLDA, fB, fLDB); #if 0 for (i = 0; i < mb * nb; i++) { B[i] = floatComplex(complex_real(fB[i]), complex_imag(fB[i])); } free(fB); free(fA); #endif } void blasZtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb) { char fSide, fUplo, fTransA, fDiag; int fM, fN; int fLDA, fLDB; doublecomplex *fA, *fB; doublecomplex fAlpha; #if 0 size_t ma, na, mb, nb; size_t i; ma = lda; if (side == clblasLeft) { na = M; } else { na = N; } mb = ldb; nb = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); 
abort(); } fSide = encodeSide(side); fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fDiag = encodeDiag(diag); fM = (int)M; fN = (int)N; fLDA = (int)lda; fLDB = (int)ldb; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); #if 0 fA = (doublecomplex*)calloc(ma * na, sizeof(doublecomplex)); if (fA == NULL) { return; } fB = (doublecomplex*)calloc(mb * nb, sizeof(doublecomplex)); if (fB == NULL) { free(fA); return; } for (i = 0; i < ma * na; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < mb * nb; i++) { fB[i] = compose_doublecomplex(CREAL(B[i]), CIMAG(B[i])); } #else fA = (doublecomplex*)A; fB = (doublecomplex*)B; #endif ztrsm(fSide, fUplo, fTransA, fDiag, fM, fN, &fAlpha, fA, fLDA, fB, fLDB); #if 0 for (i = 0; i < mb * nb; i++) { B[i] = doubleComplex(doublecomplex_real(fB[i]), doublecomplex_imag(fB[i])); } free(fB); free(fA); #endif } void blasSsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDB, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; ssyr2k(fUplo, fTransA, fN, fK, alpha, (float*)A, fLDA, (float*)B, fLDB, beta, C, fLDC); } void blasDsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDB, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; dsyr2k(fUplo, fTransA, fN, fK, alpha, (double*)A, fLDA, (double*)B, fLDB, beta, C, fLDC); } void blasCsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDB, fLDC; complex *fA, *fB, *fC; complex fAlpha, fBeta; #if 0 size_t na, ka, nb, kb, rowsC, columnsC; size_t i; if (transA == clblasNoTrans) { na = lda; ka = K; nb = ldb; kb = K; } else { ka = lda; na = N; kb = ldb; nb = N; } rowsC = ldc; columnsC = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); #if 0 fA = (complex*)calloc(na * ka, sizeof(complex)); if (fA == NULL) { return; } fB = (complex*)calloc(nb * kb, sizeof(complex)); if (fB == NULL) { free(fA); return; } fC = (complex*)calloc(rowsC * columnsC, sizeof(complex)); if (fC == NULL) { free(fB); free(fA); return; } for (i = 0; i < na * ka; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < nb * kb; i++) { fB[i] = compose_complex(CREAL(B[i]), CIMAG(B[i])); } for (i = 0; i < rowsC * columnsC; i++) { fC[i] = compose_complex(CREAL(C[i]), CIMAG(C[i])); } #else 
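/* active path: reinterpret A, B and C directly as the reference complex type
 * instead of allocating converted copies */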
fA = (complex*)A; fB = (complex*)B; fC = (complex*)C; #endif csyr2k(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC); #if 0 for (i = 0; i < rowsC * columnsC; i++) { C[i] = floatComplex(complex_real(fC[i]), complex_imag(fC[i])); } free(fC); free(fB); free(fA); #endif } void blasZsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDB, fLDC; doublecomplex *fA, *fB, *fC; doublecomplex fAlpha, fBeta; #if 0 size_t na, ka, nb, kb, rowsC, columnsC; size_t i; if (transA == clblasNoTrans) { na = lda; ka = K; nb = ldb; kb = K; } else { ka = lda; na = N; kb = ldb; nb = N; } rowsC = ldc; columnsC = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); #if 0 fA = (doublecomplex*)calloc(na * ka, sizeof(doublecomplex)); if (fA == NULL) { return; } fB = (doublecomplex*)calloc(nb * kb, sizeof(doublecomplex)); if (fB == NULL) { free(fA); return; } fC = (doublecomplex*)calloc(rowsC * columnsC, sizeof(doublecomplex)); if (fC == NULL) { free(fB); free(fA); return; } for (i = 0; i < na * ka; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < nb * kb; i++) { fB[i] = compose_doublecomplex(CREAL(B[i]), CIMAG(B[i])); } for (i = 0; i < rowsC * columnsC; i++) { fC[i] = compose_doublecomplex(CREAL(C[i]), CIMAG(C[i])); } #else fA = (doublecomplex*)A; fB = (doublecomplex*)B; fC = (doublecomplex*)C; #endif zsyr2k(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA, fB, fLDB, &fBeta, fC, fLDC); #if 0 for (i = 0; i < rowsC * columnsC; i++) { C[i] = doubleComplex(doublecomplex_real(fC[i]), doublecomplex_imag(fC[i])); } free(fC); free(fB); free(fA); #endif } void blasSsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, float beta, float *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; ssyrk(fUplo, fTransA, fN, fK, alpha, (float*)A, fLDA, beta, C, fLDC); } void blasDsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, double beta, double *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; dsyrk(fUplo, fTransA, fN, fK, alpha, (double*)A, fLDA, beta, C, fLDC); } void blasCsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex beta, FloatComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; complex *fA, *fC; complex fAlpha, fBeta; #if 0 size_t i; size_t na, ka, rowsC, columnsC; if 
(transA == clblasNoTrans) { na = lda; ka = K; } else { ka = lda; na = N; } rowsC = ldc; columnsC = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); #if 0 fA = (complex*)calloc(na * ka, sizeof(complex)); if (fA == NULL) { return; } fC = (complex*)calloc(rowsC * columnsC, sizeof(complex)); if (fC == NULL) { free(fA); return; } for (i = 0; i < na * ka; i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < rowsC * columnsC; i++) { fC[i] = compose_complex(CREAL(C[i]), CIMAG(C[i])); } #else fA = (complex*)A; fC = (complex*)C; #endif csyrk(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA, &fBeta, fC, fLDC); #if 0 for (i = 0; i < rowsC * columnsC; i++) { C[i] = floatComplex(complex_real(fC[i]), complex_imag(fC[i])); } free(fC); free(fA); #endif } void blasZsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex beta, DoubleComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; doublecomplex *fA, *fC; doublecomplex fAlpha, fBeta; #if 0 size_t na, ka, rowsC, columnsC; size_t i; if (transA == clblasNoTrans) { na = lda; ka = K; } else { ka = lda; na = N; } rowsC = ldc; columnsC = N; #endif if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); #if 0 fA = (doublecomplex*)calloc(na * ka, sizeof(doublecomplex)); if (fA == NULL) { return; } fC = (doublecomplex*)calloc(rowsC * columnsC, sizeof(doublecomplex)); if (fC == NULL) { free(fA); return; } for (i = 0; i < na * ka; i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < rowsC * columnsC; i++) { fC[i] = compose_doublecomplex(CREAL(C[i]), CIMAG(C[i])); } #else fA = (doublecomplex*)A; fC = (doublecomplex*)C; #endif zsyrk(fUplo, fTransA, fN, fK, &fAlpha, fA, fLDA, &fBeta, fC, fLDC); #if 0 for (i = 0; i < rowsC * columnsC; i++) { C[i] = doubleComplex(doublecomplex_real(fC[i]), doublecomplex_imag(fC[i])); } free(fC); free(fA); #endif } void blasStrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; strmv( fUplo, fTrans, fDiag, fN, A+offa, fLda, X+offx, incx ); } void blasDtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; dtrmv( fUplo, fTrans, fDiag, fN, 
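/* The level-2 triangular wrappers apply the buffer offsets (offa, offx) with
 * pointer arithmetic before handing the arrays to the reference routine. */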
A+offa , fLda, X+offx, incx ); } void blasCtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; complex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; fA = (complex*) A + offa; fX = (complex*) X + offx; ctrmv( fUplo, fTrans, fDiag, fN, fA, fLda, fX, incx ); } void blasZtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; doublecomplex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; fA = (doublecomplex*)A + offa; fX = (doublecomplex*)X + offx; ztrmv( fUplo, fTrans, fDiag, fN, fA, fLda, fX, incx ); } //TPMV void blasStpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *AP, size_t offa, float *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; stpmv( fUplo, fTrans, fDiag, fN, AP+offa, X+offx, incx ); } void blasDtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *AP, size_t offa, double *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; dtpmv( fUplo, fTrans, fDiag, fN, AP+offa , X+offx, incx ); } void blasCtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *AP, size_t offa, FloatComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; complex *fAP, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fAP = (complex*) AP + offa; fX = (complex*) X + offx; ctpmv( fUplo, fTrans, fDiag, fN, fAP, fX, incx ); } void blasZtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *AP, size_t offa, DoubleComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; doublecomplex *fAP, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fAP = (doublecomplex*)AP + offa; fX = (doublecomplex*)X + offx; ztpmv( fUplo, fTrans, fDiag, fN, fAP, fX, incx ); } void blasStrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; fUplo = encodeUplo(uplo); fTrans = 
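/* The packed TPMV wrappers above pass AP+offa with no leading dimension, since
 * packed triangular storage has none; the TRSV/TPSV wrappers below follow the
 * same pointer-offset convention. */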
encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; strsv( fUplo, fTrans, fDiag, fN, (A+offa), fLda, (X+offx), incx ); } void blasDtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; dtrsv( fUplo, fTrans, fDiag, fN, (A+offa), fLda, (X+offx), incx ); } void blasCtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; complex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; #if 0 fA = (complex*)calloc(N * lda, sizeof(complex)); if (fA == NULL) { return; } fX = (complex*)calloc(1 + ((N-1)* abs(incx)), sizeof(complex)); if (fX == NULL) { free(fA); return; } for (i = 0; i < (N * lda); i++) { fA[i] = compose_complex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < (1 +((N-1)* abs(incx))); i++) { fX[i] = compose_complex(CREAL(X[i]), CIMAG(X[i])); } #else fA = (complex*)A; fX = (complex*)X; #endif ctrsv(fUplo, fTrans,fDiag, fN,fA+offa, fLda, fX+offx, incx); #if 0 for (i = 0; i < (1 +((N-1)* abs(incx))); i++) { X[i] = floatComplex(complex_real(fX[i]), complex_imag(fX[i])); } free(fX); free(fA); #endif } void blasZtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN, fLda; doublecomplex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fLda = (int)lda; #if 0 fA = (doublecomplex*)calloc(N * lda, sizeof(doublecomplex)); if (fA == NULL) { return; } fX = (doublecomplex*)calloc((1 + ((N-1) * abs(incx))), sizeof(doublecomplex)); if (fX == NULL) { free(fX); return; } for (i = 0; i < (N * lda); i++) { fA[i] = compose_doublecomplex(CREAL(A[i]), CIMAG(A[i])); } for (i = 0; i < (1 + ((N-1) * abs(incx))); i++) { fX[i] = compose_doublecomplex(CREAL(X[i]), CIMAG(X[i])); } #else fA = (doublecomplex*)A; fX = (doublecomplex*)X; #endif ztrsv( fUplo, fTrans, fDiag, fN, fA + offa, fLda, fX + offx, incx ); #if 0 for (i = 0; i < ((1 + ((N-1) * abs(incx))); i++) { X[i] = doubleComplex(doublecomplex_real(fX[i]), doublecomplex_imag(fX[i])); } free(fX); free(fA); #endif } void blasStpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, float *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; stpsv( fUplo, fTrans, fDiag, fN, (A+offa), (X+offx), incx ); } 
void blasDtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, double *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; dtpsv( fUplo, fTrans, fDiag, fN, (A+offa), (X+offx), incx ); } void blasCtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, FloatComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; complex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fA = (complex*)A; fX = (complex*)X; ctpsv(fUplo, fTrans,fDiag, fN,fA+offa, fX+offx, incx); } void blasZtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, DoubleComplex *X, size_t offx, int incx) { char fUplo, fDiag, fTrans; int fN; doublecomplex *fA, *fX; fUplo = encodeUplo(uplo); fTrans = encodeTranspose(transA); fDiag = encodeDiag(diag); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fN = (int)N; fA = (doublecomplex*)A; fX = (doublecomplex*)X; ztpsv(fUplo, fTrans,fDiag, fN,fA+offa, fX+offx, incx); } void blasSsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, float* A, size_t offa, size_t lda, float* B, size_t offb, size_t ldb, float beta, float* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ssymm( fSide, fUplo, fM, fN, alpha, (A+offa), fLda, (B+offb), fLdb, beta, (C+offc), fLdc ); } void blasDsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, double* A, size_t offa, size_t lda, double* B, size_t offb, size_t ldb, double beta, double* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dsymm( fSide, fUplo, fM, fN, alpha, (A+offa), fLda, (B+offb), fLdb, beta, (C+offc), fLdc ); } void blasCsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; complex *fA, *fB, *fC, fAlpha, fBeta; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; fA = (complex*) A; fB = (complex*) B; fC = (complex*) C; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); 
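/* SYMM wrappers: offsets offa/offb/offc are applied to the (possibly casted)
 * matrix pointers, and the complex variants pass alpha/beta by address. */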
abort(); } csymm( fSide, fUplo, fM, fN, &fAlpha, (fA+offa), fLda, (fB+offb), fLdb, &fBeta, (fC+offc), fLdc ); } void blasZsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; doublecomplex *fA, *fB, *fC, fAlpha, fBeta; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; fA =(doublecomplex*) A; fB =(doublecomplex*) B; fC =(doublecomplex*) C; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zsymm( fSide, fUplo, fM, fN, &fAlpha, (fA+offa), fLda, (fB+offb), fLdb, &fBeta, (fC+offc), fLdc ); } void blasSger( clblasOrder order, size_t M, size_t N, float alpha, float* x, size_t offx, int incx, float* y, size_t offy, int incy, float* A, size_t offa, size_t lda) { int fM, fN, fLda; fM = (int) M; fN = (int) N; fLda= (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } sger( fM, fN, alpha, (x+offx), incx, (y+offy), incy, (A+offa), fLda ); } void blasDger( clblasOrder order, size_t M, size_t N, double alpha, double* x, size_t offx, int incx, double* y, size_t offy, int incy, double* A, size_t offa, size_t lda) { int fM, fN, fLda; fM = (int) M; fN = (int) N; fLda= (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dger( fM, fN, alpha, (x+offx), incx, (y+offy), incy, (A+offa), fLda ); } void blasCgeru( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda) { int fM, fN, fLda; complex *fA, *fx, *fy, fAlpha; fM = (int) M; fN = (int) N; fLda= (int) lda; fA = (complex*) A; fx = (complex*) x; fy = (complex*) y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } cgeru( fM, fN, &fAlpha, (fx+offx), incx, (fy+offy), incy, (fA+offa), fLda ); } void blasZgeru( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda) { int fM, fN, fLda; doublecomplex *fA, *fx, *fy, fAlpha; fM = (int) M; fN = (int) N; fLda= (int) lda; fA =(doublecomplex*) A; fx =(doublecomplex*) x; fy =(doublecomplex*) y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zgeru( fM, fN, &fAlpha, (fx+offx), incx, (fy+offy), incy, (fA+offa), fLda ); } void blasCgerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda) { int fM, fN, fLda; complex *fA, *fx, *fy, fAlpha; fM = (int) M; fN = (int) N; fLda= (int) lda; fA = (complex*) A; fx = (complex*) x; fy = (complex*) y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines 
require clblasColumnMajor order\n"); abort(); } cgerc( fM, fN, &fAlpha, (fx+offx), incx, (fy+offy), incy, (fA+offa), fLda ); } void blasZgerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda) { int fM, fN, fLda; doublecomplex *fA, *fx, *fy, fAlpha; fM = (int) M; fN = (int) N; fLda= (int) lda; fA =(doublecomplex*) A; fx =(doublecomplex*) x; fy =(doublecomplex*) y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zgerc( fM, fN, &fAlpha, (fx+offx), incx, (fy+offy), incy, (fA+offa), fLda ); } void blasSsyr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* A, size_t offa, size_t lda) { char fUplo; int fN, fLda, fIncx; float *fA, fAlpha, *fX; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fA = (float*) A; fX = (float*) X; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ssyr(fUplo, fN, fAlpha, (fX + offx), fIncx, (fA + offa), fLda); } void blasDsyr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* A, size_t offa, size_t lda) { char fUplo; int fN, fLda, fIncx; double *fA, fAlpha, *fX; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fA = (double*) A; fX = (double*) X; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dsyr(fUplo, fN, fAlpha, (fX + offx), fIncx, (fA + offa), fLda); } //SPR void blasSspr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* AP, size_t offa) { char fUplo; int fN, fIncx; float *fAP, fAlpha, *fX; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fAP = (float*) AP; fX = (float*) X; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } sspr(fUplo, fN, fAlpha, (fX + offx), fIncx, (fAP + offa)); } void blasDspr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* AP, size_t offa) { char fUplo; int fN, fIncx; double *fAP, fAlpha, *fX; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fAP = (double*) AP; fX = (double*) X; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dspr(fUplo, fN, fAlpha, (fX + offx), fIncx, (fAP + offa)); } void blasCher( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* A, size_t offa, size_t lda) { char fUplo; int fN, fLda; complex *fA, *fx ; fUplo = encodeUplo( uplo ); fN = (int) N; fLda= (int) lda; fA = (complex*) A; fx = (complex*) x; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } cher( fUplo, fN, alpha, (fx+offx), incx, (fA+offa), fLda ); } void blasZher( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* A, size_t offa, size_t lda) { char fUplo; int fN, fLda; doublecomplex *fA, *fx; fUplo = encodeUplo( uplo ); fN = (int) N; fLda= (int) lda; fA =(doublecomplex*) A; fx =(doublecomplex*) x; if (order 
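/* HER takes a real-valued alpha (float for cher, double for zher) even though
 * the matrix and vector are complex, so alpha is forwarded without repacking. */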
!= clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zher( fUplo, fN, alpha, (fx+offx), incx, (fA+offa), fLda ); } void blasSsyr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* A, size_t offa, size_t lda) { char fUplo; int fN, fLda, fIncx, fIncy; float *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fIncy = (int) incy; fA = (float*) A; fX = (float*) X; fY = (float*) Y; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ssyr2(fUplo, fN, fAlpha, (fX + offx), fIncx, (fY + offy), fIncy, (fA + offa), fLda); } void blasDsyr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* A, size_t offa, size_t lda) { char fUplo; int fN, fLda, fIncx, fIncy; double *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fIncy = (int) incy; fA = (double*) A; fX = (double*) X; fY = (double*) Y; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dsyr2(fUplo, fN, fAlpha, (fX + offx), fIncx, (fY + offy), fIncy, (fA + offa), fLda); } //HER2 void blasCher2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda) { char fUplo; int fN, fLda; complex *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fA = (complex*) A; fX = (complex*) X; fY = (complex*) Y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } cher2(fUplo, fN, &fAlpha, (fX + offx), incx, (fY + offy), incy, (fA + offa), fLda); } void blasZher2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda) { char fUplo; int fN, fLda ; doublecomplex *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fA = (doublecomplex*) A; fX = (doublecomplex*) X; fY = (doublecomplex*) Y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zher2(fUplo, fN, &fAlpha, (fX + offx), incx, (fY + offy), incy, (fA + offa), fLda); } void blasChemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy) { char fUplo; int fN, fLda, fIncx, fIncy; complex *fA, fAlpha, fBeta, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fIncy = (int) incy; fA = (complex*) A; fX = (complex*) X; fY = (complex*) Y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chemv(fUplo, fN, &fAlpha, (fA + offa), fLda, (fX + offx), fIncx, &fBeta, (fY + offy), fIncy); } void blasZhemv( clblasOrder order, clblasUplo uplo, size_t 
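/* HER2 and HEMV use complex alpha (and beta for HEMV), repacked and passed by
 * address just like the level-3 complex wrappers. */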
N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy) { char fUplo; int fN, fLda, fIncx, fIncy; doublecomplex *fA, fAlpha, fBeta, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fLda = (int) lda; fIncx = (int) incx; fIncy = (int) incy; fA = (doublecomplex*) A; fX = (doublecomplex*) X; fY = (doublecomplex*) Y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhemv(fUplo, fN, &fAlpha, (fA + offa), fLda, (fX + offx), fIncx, &fBeta, (fY + offy), fIncy); } //HEMM void blasChemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; complex *fA, *fB, *fC, fAlpha, fBeta; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; fA = (complex*) A; fB = (complex*) B; fC = (complex*) C; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chemm( fSide, fUplo, fM, fN, &fAlpha, (fA+offa), fLda, (fB+offb), fLdb, &fBeta, (fC+offc), fLdc ); } void blasZhemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc) { char fSide, fUplo; int fM, fN, fLda, fLdb, fLdc; doublecomplex *fA, *fB, *fC, fAlpha, fBeta; fSide = encodeSide( side ); fUplo = encodeUplo( uplo ); fM = (int) M; fN = (int) N; fLda= (int) lda; fLdb = (int) ldb; fLdc = (int) ldc; fA =(doublecomplex*) A; fB =(doublecomplex*) B; fC =(doublecomplex*) C; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhemm( fSide, fUplo, fM, fN, &fAlpha, (fA+offa), fLda, (fB+offb), fLdb, &fBeta, (fC+offc), fLdc ); } void blasCherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const FloatComplex *A, size_t lda, float beta, FloatComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; complex *fA, *fC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; fA = (complex*)A; fC = (complex*)C; cherk(fUplo, fTransA, fN, fK, alpha, fA, fLDA, beta, fC, fLDC); } void blasZherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const DoubleComplex *A, size_t lda, double beta, DoubleComplex *C, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC; doublecomplex *fA, *fC; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN 
= (int)N; fK = (int)K; fLDA = (int)lda; fLDC = (int)ldc; fA = (doublecomplex*)A; fC = (doublecomplex*)C; zherk(fUplo, fTransA, fN, fK, alpha, fA, fLDA, beta, fC, fLDC); } void blasSspmv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t offa, const float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { char fUplo; int fN; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fN = (int)N; sspmv(fUplo, fN, alpha, (float*)(A+offa), (float*)(X+offx), incx, beta, (Y+offy), incy); } void blasDspmv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t offa, const double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { char fUplo; int fN; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fN = (int)N; dspmv(fUplo, fN, alpha, (double*)(A+offa),(double*)(X+offx), incx, beta, (Y+offy), incy); } void blasChpmv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy) { char fUplo; int fN, fIncx, fIncy; complex *fA, fAlpha, fBeta, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fIncy = (int) incy; fA = (complex*) A; fX = (complex*) X; fY = (complex*) Y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chpmv(fUplo, fN, &fAlpha, (fA + offa), (fX + offx), fIncx, &fBeta, (fY + offy), fIncy); } void blasZhpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy) { char fUplo; int fN, fIncx, fIncy; doublecomplex *fA, fAlpha, fBeta, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fIncy = (int) incy; fA = (doublecomplex*) A; fX = (doublecomplex*) X; fY = (doublecomplex*) Y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhpmv(fUplo, fN, &fAlpha, (fA + offa), (fX + offx), fIncx, &fBeta, (fY + offy), fIncy); } void blasChpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* A, size_t offa) { char fUplo; int fN; complex *fA, *fx ; fUplo = encodeUplo( uplo ); fN = (int) N; fA = (complex*) A; fx = (complex*) x; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chpr( fUplo, fN, alpha, (fx+offx), incx, (fA+offa)); } void blasZhpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* A, size_t offa) { char fUplo; int fN; doublecomplex *fA, *fx; fUplo = encodeUplo( uplo ); fN = (int) N; fA =(doublecomplex*) A; fx =(doublecomplex*) x; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhpr( fUplo, fN, alpha, (fx+offx), incx, (fA+offa) ); } void blasSspr2( clblasOrder order, clblasUplo uplo, size_t N, float 
alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* A, size_t offa) { char fUplo; int fN, fIncx, fIncy; float *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fIncy = (int) incy; fA = (float*) A; fX = (float*) X; fY = (float*) Y; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } sspr2(fUplo, fN, fAlpha, (fX + offx), fIncx, (fY + offy), fIncy, (fA + offa)); } void blasDspr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* A, size_t offa) { char fUplo; int fN, fIncx, fIncy; double *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fIncx = (int) incx; fIncy = (int) incy; fA = (double*) A; fX = (double*) X; fY = (double*) Y; fAlpha = alpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dspr2(fUplo, fN, fAlpha, (fX + offx), fIncx, (fY + offy), fIncy, (fA + offa)); } void blasChpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* A, size_t offa) { char fUplo; int fN; complex *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fA = (complex*) A; fX = (complex*) X; fY = (complex*) Y; fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chpr2(fUplo, fN, &fAlpha, (fX + offx), incx, (fY + offy), incy, (fA + offa)); } void blasZhpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* A, size_t offa) { char fUplo; int fN ; doublecomplex *fA, fAlpha, *fX, *fY; fUplo = encodeUplo( uplo ); fN = (int) N; fA = (doublecomplex*) A; fX = (doublecomplex*) X; fY = (doublecomplex*) Y; fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhpr2(fUplo, fN, &fAlpha, (fX + offx), incx, (fY + offy), incy, (fA + offa)); } void blasSgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { char fTrans; int fN, fM, fKL, fKU, fLda; fTrans = encodeTranspose(trans); fN = (int) N; fM = (int) M; fKL = (int) KL; fKU = (int) KU; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } sgbmv(fTrans, fM, fN, fKL, fKU, alpha, (A+offa), fLda, (X+offx), incx, beta, (Y+offy), incy); } void blasDgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { char fTrans; int fN, fM, fKL, fKU, fLda; fTrans = encodeTranspose(trans); fN = (int) N; fM = (int) M; fKL = (int) KL; fKU = (int) KU; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dgbmv(fTrans, fM, fN, fKL, fKU, alpha, (A+offa), fLda, (X+offx), incx, beta, (Y+offy), incy); } void blasCgbmv( clblasOrder order, clblasTranspose trans, 
size_t M, size_t N, size_t KL, size_t KU, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy) { char fTrans; int fN, fM, fKL, fKU, fLda; complex *fA, *fX, *fY, fAlpha, fBeta; fTrans = encodeTranspose(trans); fN = (int) N; fM = (int) M; fKL = (int) KL; fKU = (int) KU; fLda = (int) lda; fA = (complex*) (A + offa); fX = (complex*) (X + offx); fY = (complex*) (Y + offy); fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } cgbmv(fTrans, fM, fN, fKL, fKU, &fAlpha, fA, fLda, fX, incx, &fBeta, fY, incy); } void blasZgbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy) { char fTrans; int fN, fM, fKL, fKU, fLda; doublecomplex *fA, *fX, *fY, fAlpha, fBeta; fTrans = encodeTranspose(trans); fN = (int) N; fM = (int) M; fKL = (int) KL; fKU = (int) KU; fLda = (int) lda; fA = (doublecomplex*) (A + offa); fX = (doublecomplex*) (X + offx); fY = (doublecomplex*) (Y + offy); fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zgbmv(fTrans, fM, fN, fKL, fKU, &fAlpha, fA, fLda, fX, incx, &fBeta, fY, incy); } //TBMV void blasStbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; fTrans = encodeTranspose(trans); fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } stbmv(fUplo, fTrans, fDiag, fN, fK, (A+offa), fLda, (X+offx), incx ); } void blasDtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; fTrans = encodeTranspose(trans); fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dtbmv(fUplo, fTrans, fDiag, fN, fK, (A+offa), fLda, (X+offx), incx ); } void blasCtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; complex *fA, *fX; fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fTrans = encodeTranspose(trans); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (complex*) (A + offa); fX = (complex*) (X + offx); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ctbmv(fUplo, fTrans, fDiag, fN, fK, fA, fLda, fX, incx ); } void blasZtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, 
size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; doublecomplex *fA, *fX; fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fTrans = encodeTranspose(trans); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (doublecomplex*) (A + offa); fX = (doublecomplex*) (X + offx); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ztbmv(fUplo, fTrans, fDiag, fN, fK, fA, fLda, fX, incx ); } //SBMV void blasSsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy) { char fUplo; int fN, fK, fLda; fUplo = encodeUplo(uplo); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ssbmv( fUplo, fN, fK, alpha, (A+offa), fLda, (X+offx), incx, beta, (Y+offy), incy ); } void blasDsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy) { char fUplo; int fN, fK, fLda; fUplo = encodeUplo(uplo); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dsbmv(fUplo, fN, fK, alpha, (A+offa), fLda, (X+offx), incx, beta, (Y+offy), incy ); } //HBMV void blasChbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy) { char fUplo; int fN, fK, fLda; complex *fA, *fX, *fY, fAlpha, fBeta; fUplo = encodeUplo(uplo); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (complex*) (A + offa); fX = (complex*) (X + offx); fY = (complex*) (Y + offy); fAlpha = compose_complex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_complex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } chbmv( fUplo, fN, fK, &fAlpha, fA, fLda, fX, incx, &fBeta, fY, incy ); } void blasZhbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy) { char fUplo; int fN, fK, fLda; doublecomplex *fA, *fX, *fY, fAlpha, fBeta; fUplo = encodeUplo(uplo); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (doublecomplex*) (A + offa); fX = (doublecomplex*) (X + offx); fY = (doublecomplex*) (Y + offy); fAlpha = compose_doublecomplex(CREAL(alpha), CIMAG(alpha)); fBeta = compose_doublecomplex(CREAL(beta), CIMAG(beta)); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } zhbmv(fUplo, fN, fK, &fAlpha, fA, fLda, fX, incx, &fBeta, fY, incy ); } //TBSV void blasStbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; fTrans = encodeTranspose(trans); fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } stbsv(fUplo, fTrans, fDiag, fN, fK, 
(A+offa), fLda, (X+offx), incx ); } void blasDtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; fTrans = encodeTranspose(trans); fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fN = (int) N; fK = (int) K; fLda = (int) lda; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } dtbsv(fUplo, fTrans, fDiag, fN, fK, (A+offa), fLda, (X+offx), incx ); } void blasCtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; complex *fA, *fX; fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fTrans = encodeTranspose(trans); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (complex*) (A + offa); fX = (complex*) (X + offx); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ctbsv(fUplo, fTrans, fDiag, fN, fK, fA, fLda, fX, incx ); } void blasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx) { char fTrans, fUplo, fDiag; int fN, fK, fLda; doublecomplex *fA, *fX; fUplo = encodeUplo(uplo); fDiag = encodeDiag(diag); fTrans = encodeTranspose(trans); fN = (int) N; fK = (int) K; fLda = (int) lda; fA = (doublecomplex*) (A + offa); fX = (doublecomplex*) (X + offx); if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } ztbsv(fUplo, fTrans, fDiag, fN, fK, fA, fLda, fX, incx ); } void blasCher2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t offa, size_t lda, const FloatComplex *B, size_t offb, size_t ldb, float beta, FloatComplex *C, size_t offc, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC, fLDB; complex *fA, *fC, *fB, *fAlpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fA = (complex*)(A+offa); fB = (complex*)(B+offb); fC = (complex*)(C+offc); fAlpha = (complex*)(&alpha); cher2k(fUplo, fTransA, fN, fK, fAlpha, fA, fLDA, fB, fLDB, beta, fC, fLDC); } void blasZher2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t offa, size_t lda, const DoubleComplex *B, size_t offb, size_t ldb, double beta, DoubleComplex *C, size_t offc, size_t ldc) { char fUplo, fTransA; int fN, fK; int fLDA, fLDC, fLDB; doublecomplex *fA, *fC, *fB, *fAlpha; if (order != clblasColumnMajor) { fprintf(stderr, "LAPACK routines require clblasColumnMajor order\n"); abort(); } fUplo = encodeUplo(uplo); fTransA = encodeTranspose(transA); fN = (int)N; fK = (int)K; fLDA = (int)lda; fLDB = (int)ldb; fLDC = (int)ldc; fA = (doublecomplex*)(A+offa); fB = (doublecomplex*)(B+offb); fC = (doublecomplex*)(C+offc); fAlpha = (doublecomplex*)(&alpha); zher2k(fUplo, fTransA, fN, fK, fAlpha, fA, fLDA, fB, fLDB, beta, fC, fLDC); } //COPY void blasScopy( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int 
incy) { int fN; fN = (int) N; scopy(fN, (X+offx), incx, (Y+offy), incy); } void blasDcopy( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy) { int fN; fN = (int) N; dcopy( fN, (X+offx), incx, (Y+offy), incy ); } void blasCcopy( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { int fN; complex *fY, *fX; fN = (int) N; fY = (complex*) (Y + offy); fX = (complex*) (X + offx); ccopy( fN, fX, incx, fY, incy ); } void blasZcopy( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { int fN; doublecomplex *fY, *fX; fN = (int) N; fY = (doublecomplex*) (Y + offy); fX = (doublecomplex*) (X + offx); zcopy(fN, fX, incx , fY, incy); } //SWAP void blasSswap( size_t N, float *X, size_t offBX, int incx, float *Y, size_t offCY, int incy) { int fN; fN = (int) N; sswap(fN, (X+offBX), incx, (Y+offCY), incy); } void blasDswap( size_t N, double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy) { int fN; fN = (int) N; dswap( fN, (X+offBX), incx, (Y+offCY), incy ); } void blasCswap( size_t N, FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy) { int fN; complex *fY, *fX; fN = (int) N; fY = (complex*) (Y + offCY); fX = (complex*) (X + offBX); cswap( fN, fX, incx, fY, incy ); } void blasZswap( size_t N, DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy) { int fN; doublecomplex *fY, *fX; fN = (int) N; fY = (doublecomplex*) (Y + offCY); fX = (doublecomplex*) (X + offBX); zswap(fN, fX, incx , fY, incy); } void blasSscal( size_t N, float alpha, float *X, size_t offx, int incx) { sscal((int)N, alpha, (X+offx), incx); } void blasDscal( size_t N, double alpha, double *X, size_t offx, int incx) { dscal((int)N, alpha, (X+offx), incx); } void blasCscal( size_t N, FloatComplex alpha, FloatComplex *X, size_t offx, int incx) { cscal((int)N, (complex*)(&alpha), (complex*)(X+offx), incx); } void blasZscal( size_t N, DoubleComplex alpha, DoubleComplex *X, size_t offx, int incx) { zscal((int)N, (doublecomplex*)(&alpha), (doublecomplex*)(X+offx), incx); } void blasCsscal( size_t N, float alpha, FloatComplex *X, size_t offx, int incx) { csscal((int)N, alpha, (complex*)(X+offx), incx); } void blasZdscal( size_t N, double alpha, DoubleComplex *X, size_t offx, int incx) { zdscal((int)N, alpha, (doublecomplex*)(X+offx), incx); } //DOT float blasSdot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy) { return sdot((int)N, (X+offx), incx, (Y+offy), incy); } double blasDdot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy) { return ddot( (int)N, (X+offx), incx, (Y+offy), incy ); } FloatComplex blasCdotu( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { complex ans = cdotu((int)N, (complex*)(X+offx), incx, (complex*)(Y+offy), incy); FloatComplex ret; CREAL(ret) = ans.real; CIMAG(ret) = ans.imag; return ret; } DoubleComplex blasZdotu( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { doublecomplex answer = zdotu( (int)N, (doublecomplex*)(X+offx), incx, (doublecomplex*)(Y+offy), incy ); DoubleComplex ret2; CREAL(ret2) = answer.real; CIMAG(ret2) = answer.imag; return ret2; } //ASUM float blasSasum( size_t N, float *X, size_t offx, int incx) { return sasum((int)N, (X+offx), incx); } double blasDasum( size_t N, double *X, size_t offx, int incx) { return dasum( (int)N, (X+offx), incx); } float blasScasum( size_t N, FloatComplex 
*X, size_t offx, int incx) { return scasum((int)N, (complex*)(X+offx), incx); } double blasDzasum( size_t N, DoubleComplex *X, size_t offx, int incx) { return dzasum( (int)N, (doublecomplex*)(X+offx), incx); } //DOTC FloatComplex blasCdotc( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy) { complex ans = cdotc((int)N, (complex*)(X+offx), incx, (complex*)(Y+offy), incy); FloatComplex ret; CREAL(ret) = ans.real; CIMAG(ret) = ans.imag; return ret; } DoubleComplex blasZdotc( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy) { doublecomplex answer = zdotc( (int)N, (doublecomplex*)(X+offx), incx, (doublecomplex*)(Y+offy), incy ); DoubleComplex ret2; CREAL(ret2) = answer.real; CIMAG(ret2) = answer.imag; return ret2; } void blasSaxpy( size_t N, float alpha, const float *X, size_t offBX, int incx, float *Y, size_t offCY, int incy) { saxpy((int)N, alpha, (float*)(X+offBX), incx, (Y+offCY), incy); } void blasDaxpy( size_t N, double alpha, const double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy) { daxpy((int)N, alpha, (double*)(X+offBX), incx, (Y+offCY), incy); } void blasCaxpy( size_t N, FloatComplex alpha, const FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy) { caxpy((int)N, (complex*)(&alpha),(complex*)(X+offBX), incx, (complex*)(Y+offCY), incy); } void blasZaxpy( size_t N, DoubleComplex alpha, const DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy) { zaxpy((int)N, (doublecomplex*)(&alpha), (doublecomplex*)(X+offBX), incx, (doublecomplex*)(Y+offCY), incy); } //ROTG void blasSrotg( float* SA, size_t offSA, float* SB, size_t offSB, float* C, size_t offC, float* S, size_t offS) { srotg((SA+offSA), (SB+offSB), (C+offC), (S+offS)); } void blasDrotg( double* SA, size_t offSA, double* SB, size_t offSB, double* C, size_t offC, double* S, size_t offS) { drotg((SA+offSA), (SB+offSB), (C+offC), (S+offS)); } void blasCrotg( FloatComplex* SA, size_t offSA, FloatComplex* SB, size_t offSB, float* C, size_t offC, FloatComplex* S, size_t offS) { crotg((complex*)(SA+offSA), (complex*)(SB+offSB), (C+offC), (complex*)(S+offS)); } void blasZrotg( DoubleComplex* SA, size_t offSA, DoubleComplex* SB, size_t offSB, double* C, size_t offC, DoubleComplex* S, size_t offS) { zrotg((doublecomplex*)(SA+offSA), (doublecomplex*)(SB+offSB), (C+offC), (doublecomplex*)(S+offS)); } void blasSrotmg( float *D1, size_t offD1, float *D2, size_t offD2, float *X1, size_t offX1, const float *Y1, size_t offY1, float *PARAM, size_t offParam) { // C and fortran interface are different for rotmg.. FIXME #if defined CORR_TEST_WITH_ACML srotmg(D1[offD1], D2[offD2], X1[offX1], Y1[offY1], (PARAM+offParam)); #else srotmg((D1+offD1), (D2+offD2), (X1+offX1), (Y1+offY1), (PARAM+offParam)); #endif } void blasDrotmg( double *D1, size_t offD1, double *D2, size_t offD2, double *X1, size_t offX1, const double *Y1, size_t offY1, double *PARAM, size_t offParam) { // C and fortran interface are different for rotmg.. 
FIXME #if defined CORR_TEST_WITH_ACML drotmg(D1[offD1], D2[offD2], X1[offX1], Y1[offY1], (PARAM+offParam)); #else drotmg((D1+offD1), (D2+offD2), (X1+offX1), (Y1+offY1), (PARAM+offParam)); #endif } void blasSrotm( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy, float *PARAM, size_t offParam) { srotm(N, (X+offx), incx, (Y+offy), incy, (PARAM+offParam)); } void blasDrotm( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double *PARAM, size_t offParam) { drotm(N, (X+offx), incx, (Y+offy), incy, (PARAM+offParam)); } //ROT void blasSrot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy, float C, float S) { srot(N, (X+offx), incx, (Y+offy), incy, C, S); } void blasDrot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double C, double S) { drot(N, (X+offx), incx, (Y+offy), incy, C, S); } void blasCsrot( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy, float C, float S) { csrot(N, (complex*)(X+offx), incx, (complex*)(Y+offy), incy, C, S); } void blasZdrot( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy, double C, double S) { zdrot(N, (doublecomplex*)(X+offx), incx, (doublecomplex*)(Y+offy), incy, C, S); } int blasiSamax( size_t N, float *X, size_t offx, int incx) { return isamax((int)N, (X+offx), incx); } int blasiDamax( size_t N, double *X, size_t offx, int incx) { return idamax( (int)N, (X+offx), incx); } int blasiCamax( size_t N, FloatComplex *X, size_t offx, int incx) { return icamax((int)N, (complex*)(X+offx), incx); } int blasiZamax( size_t N, DoubleComplex *X, size_t offx, int incx) { return izamax( (int)N, (doublecomplex*)(X+offx), incx); } float blasSnrm2( size_t N, float *X, size_t offx, int incx) { return snrm2((int)N, (X+offx), incx); } double blasDnrm2( size_t N, double *X, size_t offx, int incx) { return dnrm2( (int)N, (X+offx), incx); } float blasScnrm2( size_t N, FloatComplex *X, size_t offx, int incx) { return scnrm2((int)N, (complex*)(X+offx), incx); } double blasDznrm2( size_t N, DoubleComplex *X, size_t offx, int incx) { return dznrm2( (int)N, (doublecomplex*)(X+offx), incx); } clblas-2.10/src/tests/clBLAS-wrapper.cpp000066400000000000000000002556031264277366700201060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include "clBLAS-wrapper.h" clblasStatus clMath::clblas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSgemv(order, transA, M, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDgemv(order, transA, M, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCgemv(order, transA, M, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZgemv(order, transA, M, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } // SYMV wrappers clblasStatus clMath::clblas::symv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSsymv(order, uplo, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::symv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDsymv(order, uplo, N, alpha, A, offA, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, 
numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSgemm(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDgemm(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCgemm(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZgemm(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } #undef GEMMV2_VISIBLE // GEMM2 is not exported. 
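/*
 * Editorial note (not part of the original source): the gemm2() overloads that
 * follow compile to stubs returning clblasNotImplemented unless GEMMV2_VISIBLE
 * is defined, in which case they dispatch to clblas*gemmV2 (all offsets zero)
 * or clblas*gemmExV2 (non-zero offsets). The sketch below is guarded by "#if 0"
 * so it is never compiled; it shows one hypothetical way a caller could try the
 * gemm2() path and fall back to the exported gemm() wrapper defined above. The
 * variables M, N, K, alpha, beta, A, B, C, lda, ldb, ldc, queue and event are
 * assumed to be set up by the caller and are not part of the original file.
 */
#if 0
clblasStatus status = clMath::clblas::gemm2(
    clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha,
    A, 0, lda, B, 0, ldb, beta, C, 0, ldc,
    1, &queue, 0, NULL, &event);
if (status == clblasNotImplemented) {
    /* GEMM2 entry points are not exported from the library; use the regular
     * GEMM wrapper, which has the same argument list. */
    status = clMath::clblas::gemm(
        clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha,
        A, 0, lda, B, 0, ldb, beta, C, 0, ldc,
        1, &queue, 0, NULL, &event);
}
#endif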
clblasStatus clMath::clblas::gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret = clblasNotImplemented; #ifdef GEMMV2_VISIBLE //If GEMM2 is visible if (!(offA || offB || offC)) { ret = clblasSgemmV2(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { ret = clblasSgemmExV2(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } #else //To avoid warnings order = order; transA = transA; transB = transB; M = M; N = N; K = K; alpha = alpha; lda = lda; ldb = ldb; beta = beta; C = A; C = B; C = C; ldc = ldc; numCommandQueues = numCommandQueues; commandQueues = commandQueues; numEventsInWaitList = numEventsInWaitList; eventWaitList = eventWaitList; events = events; offA = offA; offB = offB; offC = offC; #endif return ret; } clblasStatus clMath::clblas::gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret = clblasNotImplemented; #ifdef GEMMV2_VISIBLE if (!(offA || offB || offC)) { ret = clblasDgemmV2(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { ret = clblasDgemmExV2(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } #else //To avoid warnings order = order; transA = transA; transB = transB; M = M; N = N; K = K; alpha = alpha; lda = lda; ldb = ldb; beta = beta; C = A; C = B; C = C; ldc = ldc; numCommandQueues = numCommandQueues; commandQueues = commandQueues; numEventsInWaitList = numEventsInWaitList; eventWaitList = eventWaitList; events = events; offA = offA; offB = offB; offC = offC; #endif return ret; } clblasStatus clMath::clblas::gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret = clblasNotImplemented; #ifdef GEMMV2_VISIBLE if (!(offA || offB || offC)) { ret = clblasCgemmV2(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { ret = clblasCgemmExV2(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } #else //To avoid warnings order = order; transA = transA; transB = transB; M = M; N = N; K = K; alpha = 
alpha; lda = lda; ldb = ldb; beta = beta; C = A; C = B; C = C; ldc = ldc; numCommandQueues = numCommandQueues; commandQueues = commandQueues; numEventsInWaitList = numEventsInWaitList; eventWaitList = eventWaitList; events = events; offA = offA; offB = offB; offC = offC; #endif return ret; } clblasStatus clMath::clblas::gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret = clblasNotImplemented; #ifdef GEMMV2_VISIBLE if (!(offA || offB || offC)) { ret = clblasZgemmV2(order, transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { ret = clblasZgemmExV2(order, transA, transB, M, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } #else //To avoid warnings order = order; transA = transA; transB = transB; M = M; N = N; K = K; alpha = alpha; lda = lda; ldb = ldb; beta = beta; C = A; C = B; C = C; ldc = ldc; numCommandQueues = numCommandQueues; commandQueues = commandQueues; numEventsInWaitList = numEventsInWaitList; eventWaitList = eventWaitList; events = events; offA = offA; offB = offB; offC = offC; #endif return ret; } clblasStatus clMath::clblas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasStrmm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDtrmm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCtrmm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t 
offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZtrmm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasStrsm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDtrsm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCtrsm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZtrsm(order, side, uplo, transA, diag, M, N, alpha, A, offA, lda, B, offB, ldb, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSsyr2k(order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, 
size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDsyr2k(order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCsyr2k(order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZsyr2k(order, uplo, transAB, N, K, alpha, A, offA, lda, B, offB, ldb, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSsyrk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDsyrk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCsyrk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, 
cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZsyrk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::trmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtrmv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::trsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx,numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtrsv( order, uplo, trans, diag, N, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::tpsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtpsv( order, uplo, trans, diag, N, A, offa, X, offx, incx,numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtpsv( order, uplo, trans, diag, N, A, offa, 
X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, FloatComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, DoubleComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZsymm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::ger( clblasOrder order, size_t M, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::ger( clblasOrder order, size_t M, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDger( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::ger( clblasOrder order, size_t M, size_t N, FloatComplex alpha, const cl_mem X, 
size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::ger( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgeru( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgerc( order, M, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::syr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::syr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyr( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } //SPR clblasStatus clMath::clblas::her( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCher( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::her( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint 
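/*
 * The overloads above resolve by scalar type: ger() with FloatComplex or
 * DoubleComplex alpha maps to the unconjugated update clblasCgeru/clblasZgeru,
 * while gerc() maps to the conjugated clblasCgerc/clblasZgerc. A minimal
 * sketch, assuming bufX, bufY, bufA, queue, event and the sizes are prepared
 * elsewhere in the harness:
 *
 *   // given a FloatComplex alpha prepared by the harness:
 *   clMath::clblas::ger (clblasColumnMajor, M, N, alpha,
 *                        bufX, 0, 1, bufY, 0, 1, bufA, 0, lda,
 *                        1, &queue, 0, NULL, &event);   // clblasCgeru path
 *   clMath::clblas::gerc(clblasColumnMajor, M, N, alpha,
 *                        bufX, 0, 1, bufY, 0, 1, bufA, 0, lda,
 *                        1, &queue, 0, NULL, &event);   // clblasCgerc path
 */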
numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZher( order, uplo, N, alpha, X, offx, incx, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::syr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::syr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsyr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::her2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::her2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZher2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, A, offa, lda, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhemv( order, uplo, N, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } //HEMM clblasStatus clMath::clblas::hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t 
lda, const cl_mem B, size_t offb, size_t ldb, FloatComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, DoubleComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhemm( order, side, uplo, M, N, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCherk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZherk(order, uplo, transA, N, K, alpha, A, offA, lda, beta, C, offC, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::tpmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtpmv( order, uplo, trans, diag, N, AP, offa, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::spmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint 
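/*
 * Note on the herk()/her2k() wrappers in this file: for Hermitian rank-k
 * updates the scaling factor beta (and, for herk, alpha) is real, so the
 * float/double overloads above forward directly to clblasCherk/clblasZherk.
 * A minimal sketch, assuming bufA, bufC, queue, event and the sizes come
 * from the harness:
 *
 *   clblasStatus st = clMath::clblas::herk(
 *       clblasColumnMajor, clblasUpper, clblasNoTrans, N, K,
 *       1.0f,               // real alpha
 *       bufA, 0, lda,
 *       0.0f,               // real beta
 *       bufC, 0, ldc,
 *       1, &queue, 0, NULL, &event);   // float overload -> clblasCherk
 */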
numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasSspmv(order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::spmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasDspmv(order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::hpmv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChpmv(order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } clblasStatus clMath::clblas::hpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhpmv(order, uplo, N, alpha, AP, offa, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } clblasStatus clMath::clblas::spr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSspr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::spr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDspr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChpr( order, uplo, N, alpha, X, offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhpr( order, uplo, N, alpha, X, 
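/*
 * The packed-storage wrappers above (spmv/hpmv/spr/hpr) take the matrix as a
 * packed buffer AP plus an offset, with no leading dimension; the hpr()
 * overloads also take a real alpha even though the matrix is complex. A
 * minimal spr() sketch, assuming bufAP holds the packed triangle (N*(N+1)/2
 * elements) and bufX, queue, event are prepared by the harness:
 *
 *   clblasStatus st = clMath::clblas::spr(
 *       clblasColumnMajor, clblasUpper, N,
 *       2.0f,                // alpha
 *       bufX, 0, 1,          // x vector
 *       bufAP, 0,            // packed A, no lda
 *       1, &queue, 0, NULL, &event);   // float overload -> clblasSspr
 */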
offx, incx, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::spr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::spr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDspr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhpr2( order, uplo, N, alpha, X, offx, incx, Y, offy, incy, AP, offa, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, 
cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZgbmv( order, trans, M, N, KL, KU, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::tbmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtbmv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, scratchBuff, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } //SBMV clblasStatus clMath::clblas::sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSsbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events ); } clblasStatus clMath::clblas::sbmv( clblasOrder order, clblasUplo uplo, size_t M, size_t K, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDsbmv( order, uplo, M, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } //HBMV clblasStatus clMath::clblas::hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint 
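/*
 * The banded wrappers (gbmv/tbmv/sbmv/hbmv) use the usual BLAS band layout,
 * where only the KL sub- and KU super-diagonals of A are stored (so lda is
 * typically at least KL+KU+1 in the conventional packing). A minimal gbmv()
 * sketch with harness-provided buffers and sizes:
 *
 *   clblasStatus st = clMath::clblas::gbmv(
 *       clblasColumnMajor, clblasNoTrans, M, N, KL, KU,
 *       1.0f,                 // alpha
 *       bufA, 0, lda,         // band-packed A
 *       bufX, 0, 1,
 *       0.0f,                 // beta
 *       bufY, 0, 1,
 *       1, &queue, 0, NULL, &event);   // cl_float overload -> clblasSgbmv
 */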
numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasChbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZhbmv( order, uplo, N, K, alpha, A, offa, lda, X, offx, incx, beta, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } //TBSV clblasStatus clMath::clblas::tbsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, //cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasStbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZtbsv( order, uplo, trans, diag, N, K, A, offa, lda, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasCher2k(order, uplo, transA, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { clblasStatus ret; ret = clblasZher2k(order, uplo, transA, N, K, alpha, A, offa, lda, B, offb, ldb, beta, C, offc, ldc, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); return ret; } clblasStatus clMath::clblas::swap( DataType type, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasSswap( N, X, offx, incx, Y, offy, incy, 
numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZswap( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::copy( DataType type, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasScopy( N, X, offx, incx, Y, offy, incy, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZcopy( N, X, offx, incx, Y, offy, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } // scal, csscal & zdscal wrappers clblasStatus clMath::clblas::scal( bool is_css_zds, size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { is_css_zds = is_css_zds; // Remove warning return clblasSscal(N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::scal( bool is_css_zds, size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { is_css_zds = is_css_zds; // Remove warning return clblasDscal(N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::scal( bool is_css_zds, size_t N, FloatComplex alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { if(is_css_zds) { return clblasCsscal(N, CREAL(alpha), X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { return clblasCscal(N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } } clblasStatus clMath::clblas::scal( bool is_css_zds, size_t N, DoubleComplex alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { if(is_css_zds) { return clblasZdscal(N, CREAL(alpha), X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } else { return clblasZscal(N, alpha, X, offx, incx, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } } // DOT clblasStatus clMath::clblas::dot( DataType type, size_t N, cl_mem dotProduct, size_t offDP, 
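/*
 * The scal() wrappers use the is_css_zds flag to pick between the mixed
 * real/complex scaling routines and the fully complex ones: with a
 * FloatComplex alpha, is_css_zds == true calls clblasCsscal with CREAL(alpha)
 * (scaling a complex vector by a real factor), while false calls clblasCscal
 * with the full complex alpha. A minimal sketch, assuming bufX, queue, event
 * and alpha exist in the harness:
 *
 *   // scale a complex vector by its real part only (csscal path)
 *   clMath::clblas::scal(true,  N, alpha, bufX, 0, 1,
 *                        1, &queue, 0, NULL, &event);
 *   // scale by the full complex alpha (cscal path)
 *   clMath::clblas::scal(false, N, alpha, bufX, 0, 1,
 *                        1, &queue, 0, NULL, &event);
 */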
cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type){ case TYPE_FLOAT: return clblasSdot( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDdot( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCdotu( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZdotu( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } //ASUM clblasStatus clMath::clblas::asum( DataType type, size_t N, cl_mem asum, size_t offAsum, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type){ case TYPE_FLOAT: return clblasSasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasScasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasDzasum( N, asum, offAsum, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } //DOTC clblasStatus clMath::clblas::dotc( DataType type, size_t N, cl_mem dotProduct, size_t offDP, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type){ case TYPE_COMPLEX_FLOAT: return clblasCdotc( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZdotc( N, dotProduct, offDP, X, offx, incx, Y, offy, incy, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } //axpy calls clblasStatus clMath::clblas::axpy( size_t N, cl_float alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSaxpy(N, alpha, X, offBX, incx, Y, offCY, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::axpy( size_t N, cl_double alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDaxpy(N, alpha, X, offBX, incx, Y, offCY, incy, numCommandQueues, commandQueues, 
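/*
 * dot() dispatches the complex cases to the unconjugated products
 * clblasCdotu/clblasZdotu, while dotc() uses the conjugated
 * clblasCdotc/clblasZdotc; both families also need a device scratch buffer.
 * A minimal sketch, assuming bufDot, bufX, bufY, bufScratch, queue and event
 * are allocated by the harness (bufScratch sized as the reduction routines
 * require):
 *
 *   clblasStatus st = clMath::clblas::dot(
 *       TYPE_COMPLEX_FLOAT, N,
 *       bufDot, 0,            // result buffer and offset
 *       bufX, 0, 1, bufY, 0, 1,
 *       bufScratch,
 *       1, &queue, 0, NULL, &event);   // -> clblasCdotu
 */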
numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::axpy( size_t N, FloatComplex alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCaxpy(N, alpha, X, offBX, incx, Y, offCY, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::axpy( size_t N, DoubleComplex alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZaxpy(N, alpha, X, offBX, incx, Y, offCY, incy, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::rotg( DataType type, cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasSrotg( SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDrotg( SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasCrotg( SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasZrotg( SA, offSA, SB, offSB, C, offC, S, offS, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::rotm( DataType type, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem PARAM, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasSrotm( N, X, offx, incx, Y, offy, incy, PARAM, offParam, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDrotm( N, X, offx, incx, Y, offy, incy, PARAM, offParam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::rotmg( DataType type, cl_mem D1, size_t offD1, cl_mem D2, size_t offD2, cl_mem X1, size_t offX1, cl_mem Y1, size_t offY1, cl_mem PARAM, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type) { case TYPE_FLOAT: return clblasSrotmg( D1, offD1, D2, offD2, X1, offX1, Y1, offY1, PARAM, offParam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDrotmg( D1, offD1, D2, offD2, X1, offX1, Y1, offY1, PARAM, offParam, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } //ROT clblasStatus clMath::clblas::rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, float C, float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasSrot( 
N, X, offx, incx, Y, offy, incy, (C), (S), numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, double C, double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasDrot( N, X, offx, incx, Y, offy, incy, (C), (S), numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, FloatComplex C, FloatComplex S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasCsrot( N, X, offx, incx, Y, offy, incy, CREAL(C), CREAL(S), numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); } clblasStatus clMath::clblas::rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, DoubleComplex C, DoubleComplex S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { return clblasZdrot( N, X, offx, incx, Y, offy, incy, CREAL(C), CREAL(S), numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); } // iAMAX clblasStatus clMath::clblas::iamax( DataType type, size_t N, cl_mem iMax, size_t offiMax, cl_mem X, size_t offx, int incx, cl_mem scratchBuf, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type){ case TYPE_FLOAT: return clblasiSamax( N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasiDamax( N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasiCamax( N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasiZamax( N, iMax, offiMax, X, offx, incx, scratchBuf, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblasStatus clMath::clblas::nrm2( DataType type, size_t N, cl_mem NRM2, size_t offNRM2, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events) { switch(type){ case TYPE_FLOAT: return clblasSnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_DOUBLE: return clblasDnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_FLOAT: return clblasScnrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); case TYPE_COMPLEX_DOUBLE: return clblasDznrm2( N, NRM2, offNRM2, X, offx, incx, scratchBuff, numCommandQueues,commandQueues, numEventsInWaitList, eventWaitList, events); default: return clblasInvalidValue; } } clblas-2.10/src/tests/cmdline.c000066400000000000000000000146051264277366700164360ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include /* strcmp */ #include /* atoi, strtol */ #include /* printf */ #include static const char *testUsage = " [--seed s] [--alpha a] [--beta b] " "[--alpha-real a] [--beta-real b] [--alpha-imag a] [--beta-imag b] " "[--use-images f] [--device dev] [--queues n]\n" "\n" "seed - seed for the random number generator" "\n" "alpha - alpha multiplier" "\n" "beta - beta multiplier" "\n" "alpha-real - alpha multiplier real part" "\n" "beta-real - beta multiplier real part" "\n" "alpha-imag - alpha-multiplier imaginary part" "\n" "beta-imag - beta-multiplier imaginary part" "\n" "use-images - allow the library to use images for computing" "\n" "device - device to run the test on, 'cpu' or 'gpu'(default)" "\n" "queues - number of command queues to use" "\n" "Parameters defined through the command line are kept over the whole " "set of custom test cases. The use-images parameter value is ignored if " "the target device is CPU\n\n"; typedef struct SetterArg { TestParams *params; const char *arg; long extra; } SetterArg; typedef struct CmdLineOpt { const char *name; unsigned int flagToSet; int (*setter)(SetterArg*); long setterExtra; } CmdLineOpt; enum { MULT_ALPHA = 0x01, MULT_BETA = 0x02, MULT_REAL_ONLY = 0x04, MULT_IMAG_ONLY = 0x08 }; static int doParseCmdLine( int argc, char *argv[], const CmdLineOpt *opts, unsigned int nrOpts, TestParams *params) { int i = 1, j = 0; int ret = 0; const CmdLineOpt *currOpt; const char *currArg; SetterArg sarg = {params, NULL, 0}; do { currArg = (const char*)argv[i]; i++; if (currArg[0] != '-') { // some of size arguments switch (j) { case 0: params->M = atoi(currArg); params->optFlags |= SET_M; break; case 1: params->N = atoi(currArg); params->optFlags |= SET_N; break; case 2: params->K = atoi(currArg); params->optFlags |= SET_K; break; } j++; continue; } else if (currArg[1] != '-') { // it can be some parameter of a used test framework, skip it j = 0; continue; } j = 0; for (currOpt = opts; currOpt < opts + nrOpts; currOpt++) { if (!strcmp(currOpt->name, &currArg[2])) { if (i == argc) { printf("Error: parameter '%s' is not specified!\n", currOpt->name); ret = -1; } else { sarg.arg = argv[i++]; sarg.extra = currOpt->setterExtra; ret = currOpt->setter(&sarg); params->optFlags |= currOpt->flagToSet; } break; } } } while ((i < argc) && !ret); return ret; } static int setSeed(SetterArg *sarg) { sarg->params->seed = atoi(sarg->arg); return 0; } static int setMult(SetterArg *sarg) { ComplexLong *mult; long val; char *end; long flags = sarg->extra; mult = (flags & MULT_BETA) ? 
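/*
 * How the parser above is driven, as a sketch: positional arguments fill M, N
 * and K in that order, and each "--name value" pair is looked up in the opts[]
 * table defined further below. Assuming a test binary named
 * "test-correctness" (illustrative only), an invocation consistent with
 * testUsage would be:
 *
 *   ./test-correctness 512 512 512 --alpha 2 --beta 1 \
 *       --seed 12345 --device gpu --queues 2
 *
 * Single-dash flags are skipped here so the underlying test framework can
 * consume its own options.
 */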
&sarg->params->beta : &sarg->params->alpha; mult->re = 0; mult->imag = 0; val = strtol(sarg->arg, &end, 10); if (!(flags & MULT_IMAG_ONLY)) { mult->re = val; } if (!(flags & MULT_REAL_ONLY)) { mult->imag = val; } return 0; } static int setDevice(SetterArg *sarg) { if (!strcmp(sarg->arg, "cpu")) { sarg->params->devType = CL_DEVICE_TYPE_CPU; sarg->params->devName = NULL; return 0; } if (!strcmp(sarg->arg, "gpu")) { sarg->params->devType = CL_DEVICE_TYPE_GPU; sarg->params->devName = NULL; return 0; } sarg->params->devName = sarg->arg; return 0; } static int setNumCommandQueues(SetterArg *sarg) { sarg->params->numCommandQueues = atoi(sarg->arg); return 0; } static const CmdLineOpt opts[] = { {"seed", SET_SEED, setSeed, 0}, {"alpha", SET_ALPHA, setMult, MULT_ALPHA | MULT_REAL_ONLY}, {"beta", SET_BETA, setMult, MULT_BETA | MULT_REAL_ONLY}, {"alpha-real", SET_ALPHA, setMult, MULT_ALPHA | MULT_REAL_ONLY}, {"alpha-imag", SET_ALPHA, setMult, MULT_ALPHA | MULT_IMAG_ONLY}, {"beta-real", SET_BETA, setMult, MULT_BETA | MULT_REAL_ONLY}, {"beta-imag", SET_BETA, setMult, MULT_BETA | MULT_IMAG_ONLY}, {"device", SET_DEVICE_TYPE, setDevice, 0}, {"queues", SET_NUM_COMMAND_QUEUES, setNumCommandQueues, 0}, }; static const unsigned int nrOpts = sizeof(opts) / sizeof(CmdLineOpt); int parseBlasCmdLineArgs( int argc, char *argv[], TestParams *params) { return doParseCmdLine(argc, argv, opts, nrOpts, params); } void printUsage(const char *appName) { printf("%s %s\n", appName, testUsage); } void parseEnv(TestParams *params) { const char *str; int createImages = 0; str = getenv("AMD_CLBLAS_GEMM_IMPLEMENTATION"); if ((str != NULL) && (strcmp(str, "1") == 0)) { createImages = 1; } str = getenv("AMD_CLBLAS_TRMM_IMPLEMENTATION"); if ((str != NULL) && (strcmp(str, "1") == 0)) { createImages = 1; } str = getenv("AMD_CLBLAS_TRSM_IMPLEMENTATION"); if ((str != NULL) && (strcmp(str, "1") == 0)) { createImages = 1; } params->optFlags = NO_FLAGS; if (createImages) { params->optFlags |= SET_USE_IMAGES; } } clblas-2.10/src/tests/common.cpp000066400000000000000000000567351264277366700166650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include cl_context getQueueContext(cl_command_queue commandQueue, cl_int *error) { cl_int err; cl_context ctx = NULL; err = clGetCommandQueueInfo(commandQueue, CL_QUEUE_CONTEXT, sizeof(cl_context), &ctx, NULL); if (error != NULL) { *error = err; } return ctx; } cl_int waitForSuccessfulFinish( cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_event *events) { cl_int err = CL_SUCCESS; cl_uint i; for (i = 0; i < numCommandQueues; i++) { cl_int e; cl_int status; e = clFinish(commandQueues[i]); if ((events != NULL) && (events[i] != NULL)) { if (e == CL_SUCCESS) { status = CL_COMPLETE; e = clGetEventInfo(events[i], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(status), &status, NULL); if ((e == CL_SUCCESS) && (status < 0)) { e = -status; } } clReleaseEvent(events[i]); } if (err == CL_SUCCESS) { err = e; } } return err; } cl_int flushAll( cl_uint numCommandQueues, cl_command_queue *commandQueues) { cl_int err; cl_uint i; for (i = 0; i < numCommandQueues; i++) { err = clFlush(commandQueues[i]); if (err != CL_SUCCESS) { return err; } } return CL_SUCCESS; } void printTestParams( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, size_t offB, size_t ldb, bool useBeta, ComplexLong beta, size_t offC, size_t ldc) { ::std::cerr << orderStr(order) << ", " << transStr(transA) << ", " << transStr(transB) << ", " << "M = " << M << ", " << "N = " << N << ", " << "K = " << K << ", " << "offA = " << offA << ", " << "offB = " << offB << ", " << "offC = " << offC << ", " << "lda = " << lda << ", " << "ldb = " << ldb << ", " << "ldc = " << ldc; if (useAlpha) { ::std::cerr << ", " << "alpha = (" << alpha.re << "," << alpha.imag << ")"; } if (useBeta) { ::std::cerr << ", " << "beta = (" << beta.re << "," << beta.imag << ")"; } ::std::cerr << std::endl; } void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, size_t offB, size_t ldb) { ::std::cerr << orderStr(order) << ", " << sideStr(side) << ", " << uploStr(uplo) << ", " << transStr(transA) << ", " << diagStr(diag) << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ::std::endl; ::std::cerr << "offA = " << offA << ", offB = " << offB << ::std::endl; ::std::cerr << "lda = " << lda << ", ldb = " << ldb << ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; } } //SYR void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, double alpha, size_t offx, int incx, size_t offa, size_t lda) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offa = " << offa << ::std::endl; if( lda ) ::std::cerr << ", lda = " << lda << ::std::endl; ::std::cerr << "alpha = " << alpha << ::std::endl; } //SPR void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, double alpha, size_t offx, int incx, size_t offa) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offa = " << offa << ::std::endl; ::std::cerr << "alpha = " << alpha << ::std::endl; } //SYR2 void printTestParams( clblasOrder order, 
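/*
 * The two queue helpers above are typically used back to back: flushAll()
 * submits whatever has been enqueued on every command queue, and
 * waitForSuccessfulFinish() then clFinish()es each queue, folds any negative
 * event status into the returned error code, and releases the events. A
 * minimal sketch around the axpy wrapper from this test library (buffer,
 * queue and event names are assumptions):
 *
 *   cl_event event = NULL;
 *   clblasStatus st = clMath::clblas::axpy(N, 2.0f, bufX, 0, 1, bufY, 0, 1,
 *                                          1, &queue, 0, NULL, &event);
 *   cl_int err = (st == clblasSuccess) ? flushAll(1, &queue) : (cl_int)st;
 *   if (err == CL_SUCCESS) {
 *       err = waitForSuccessfulFinish(1, &queue, &event);
 *   }
 */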
clblasUplo uplo, size_t N, double alpha, size_t offx, int incx, size_t offy, int incy, size_t offa, size_t lda) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl; ::std::cerr << "offa = " << offa << ::std::endl; if( lda ) ::std::cerr << ", lda = " << lda << ::std::endl; ::std::cerr << "alpha = " << alpha << ::std::endl; } //copy, dot, swap, dotc void printTestParams( size_t N, size_t offx, int incx, size_t offy, int incy) { ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl; } //HER2 void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, cl_float2 alpha, size_t offx, int incx, size_t offy, int incy, size_t offa, size_t lda) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl; ::std::cerr << "offa = " << offa << ::std::endl; if( lda ) ::std::cerr << ", lda = " << lda << ::std::endl; if(useAlpha) ::std::cerr << "alpha = (" << CREAL(alpha) << ", " << CIMAG(alpha) << ")" << ::std::endl; } //HEMV void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, ComplexLong alpha, size_t offa, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "N = " << N << ", offx = " << offx << ", incx = " << incx << ::std::endl; ::std::cerr << "offy = " << offy << ", incy = " << incy << ::std::endl; ::std::cerr << "offa = " << offa; if( lda ) ::std::cerr << ", lda = " << lda; ::std::cerr << ::std::endl << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl; } //SYMM , HEMM void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, bool useAlpha, ComplexLong alpha, bool useBeta, ComplexLong beta, size_t lda, size_t ldb, size_t ldc, size_t offa, size_t offb, size_t offc ) { ::std::cerr << orderStr(order) << ", " << sideStr(side) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ::std::endl; ::std::cerr << "lda = " << lda << ", ldb = " << ldb << ", ldc = " << ldc<< ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; } if (useBeta) { ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl; } ::std::cerr << "offa = " << offa << ", offb = " << offb << ", offc = " << offc<< ::std::endl; } //xHEMM void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, bool useAlpha, cl_float2 alpha, bool useBeta, cl_float2 beta, size_t lda, size_t ldb, size_t ldc, size_t offa, size_t offb, size_t offc ) { ::std::cerr << orderStr(order) << ", " << sideStr(side) << ", " << uploStr(uplo) << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ::std::endl; ::std::cerr << "lda = " << lda << ", ldb = " << ldb << ", ldc = " << ldc<< ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << CREAL(alpha) << "," << CIMAG(alpha) << ")" << ::std::endl; } if (useBeta) { ::std::cerr << "beta = (" << CREAL(beta) << "," << CIMAG(beta) << ")" << 
::std::endl; } ::std::cerr << "offa = " << offa << ", offb = " << offb << ", offc = " << offc<< ::std::endl; } void printTestParams( clblasOrder order, size_t M, size_t N, bool useAlpha, ComplexLong alpha, size_t lda, int incx, int incy, size_t offa, size_t offx, size_t offy ) { ::std::cerr << orderStr(order) << ", " << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ::std::endl; ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; } ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl; } // xGBMV void printTestParams( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, ComplexLong alpha, size_t offa, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy) { ::std::cerr << orderStr(order) << ", " << transStr(transA) << ", " << ::std::endl; ::std::cerr << "M = " << M << ", N = " << N << ", KL = " << KL << ", KU = " << KU << ::std::endl; ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl; ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl; ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl; } //HBMV //SBMV void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, size_t K, ComplexLong alpha, size_t offa, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ", " << ::std::endl; ::std::cerr << ", N = " << N << ", K = " << K << ::std::endl; ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl; ::std::cerr << "offa = " << offa << ", offx = " << offx << ", offy = " << offy << ::std::endl; ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; ::std::cerr << "beta = (" << beta.re << "," << beta.imag << ")" << ::std::endl; } //xTBMV void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t KLU, size_t offA, size_t lda, size_t offx, int incx, size_t offy, int incy) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ", " << transStr(transA) << ", " << diagStr(diag) << ::std::endl; ::std::cerr << ", N = " << N << ", KL or KU = " << KLU << ::std::endl; ::std::cerr << "lda = " << lda << ", incx = " << incx << ", incy = " << incy<< ::std::endl; ::std::cerr << "offa = " << offA << ", offx = " << offx << ", offy = " << offy << ::std::endl; } //HER void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, ComplexLong alpha, size_t lda, int incx, size_t offa, size_t offx) { ::std::cerr << orderStr(order) << ", " << ::std::endl; ::std::cerr << uploStr(uplo) << ", " << ::std::endl; ::std::cerr << " N = " << N << ::std::endl; ::std::cerr << "lda = " << lda << ", incx = " << incx << ::std::endl; if (useAlpha) { ::std::cerr << "alpha = (" << alpha.re << "," << alpha.imag << ")" << ::std::endl; } ::std::cerr << "offa = " << offa << ", offx = " << offx << ::std::endl; } void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t lda, int incx, size_t offa, size_t offx) { ::std::cerr << orderStr(order) << ", " << uploStr(uplo) << ", " << transStr(transA) << ", " < #include #include 
#include namespace clMath { clblasStatus BlasBase::addScratchImages(void) { //clblasStatus status; //// Height must be less than 1024 //imageA_ = clblasAddScratchImage(context_, 2048, 512, &status); //if (imageA_) { // imageB_ = clblasAddScratchImage(context_, 2048, 512, &status); //} //return status; return clblasNotImplemented; } } // namespace clblas-2.10/src/tests/correctness/blas-lapack.c000066400000000000000000000634771264277366700215420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * cblas to lapack's blas interface adapter */ #include #if !defined CORR_TEST_WITH_ACML #include "blas-lapack.h" #if defined(__APPLE__) #include #endif void sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy) { sgemv_(&transa, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } void dgemv(char transa, int m, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy) { dgemv_(&transa, &m, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } void cgemv(char transa, int m, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy) { cgemv_(&transa, &m, &n, alpha, a, &lda, x, &incx, beta, y, &incy); } void zgemv(char transa, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy) { zgemv_(&transa, &m, &n, alpha, a, &lda, x, &incx, beta, y, &incy); } void ssymv(char uplo, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy) { ssymv_(&uplo, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } void dsymv(char uplo, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy) { dsymv_(&uplo, &n, &alpha, a, &lda, x, &incx, &beta, y, &incy); } void sgemm(char transa, char transb, int m, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc) { sgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void dgemm(char transa, char transb, int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc) { dgemm_(&transa, &transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void cgemm(char transa, char transb, int m, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc) { cgemm_(&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void zgemm(char transa, char transb, int m, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc) { zgemm_(&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void strmm(char side, char uplo, 
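/*
 * Pattern used throughout this adapter file: each C wrapper forwards to the
 * Fortran-style BLAS symbol (trailing underscore), passing every scalar by
 * address because Fortran uses pass-by-reference. A hypothetical extra
 * wrapper in the same style (sswap is not necessarily part of this file;
 * shown only to illustrate the convention):
 *
 *   void sswap(int n, float *x, int incx, float *y, int incy)
 *   {
 *       sswap_(&n, x, &incx, y, &incy);   // scalars passed by address
 *   }
 */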
char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb) { strmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } void dtrmm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb) { dtrmm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } void ctrmm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb) { ctrmm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb); } void ztrmm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb) { ztrmm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb); } void strsm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb) { strsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } void dtrsm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb) { dtrsm_(&side, &uplo, &transa, &diag, &m, &n, &alpha, a, &lda, b, &ldb); } void ctrsm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb) { ctrsm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb); } void ztrsm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb) { ztrsm_(&side, &uplo, &transa, &diag, &m, &n, alpha, a, &lda, b, &ldb); } void ssyr2k(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc) { ssyr2k_(&uplo, &transa, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void dsyr2k(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc) { dsyr2k_(&uplo, &transa, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void csyr2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc) { csyr2k_(&uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void zsyr2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc) { zsyr2k_(&uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void ssyrk(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float beta, float *c, int ldc) { ssyrk_(&uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } void dsyrk(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double beta, double *c, int ldc) { dsyrk_(&uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } void csyrk(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *beta, complex *c, int ldc) { csyrk_(&uplo, &transa, &n, &k, alpha, a, &lda, beta, c, &ldc); } void zsyrk(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *beta, doublecomplex *c, int ldc) { zsyrk_(&uplo, &transa, &n, &k, alpha, a, &lda, beta, c, &ldc); } void strmv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx) { strmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void dtrmv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx) { dtrmv_( &uplo, &transa, 
&diag, &n, a, &lda, x, &incx); } void ctrmv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx) { ctrmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void ztrmv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx) { ztrmv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx) { strsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void dtrsv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx) { dtrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void ctrsv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx) { ctrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void ztrsv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx) { ztrsv_( &uplo, &transa, &diag, &n, a, &lda, x, &incx); } void ssymm(char side, char uplo, int m, int n, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc) { ssymm_( &side, &uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void dsymm(char side, char uplo, int m, int n, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc) { dsymm_( &side, &uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void csymm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc) { csymm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void zsymm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc) { zsymm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void sger(int m, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda) { sger_( &m, &n, &alpha, x, &incx, y, &incy, a, &lda); } void dger(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda) { dger_( &m, &n, &alpha, x, &incx, y, &incy, a, &lda); } void cgeru(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda) { cgeru_( &m, &n, alpha, x, &incx, y, &incy, a, &lda); } void zgeru(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda) { zgeru_( &m, &n, alpha, x, &incx, y, &incy, a, &lda); } void cgerc(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda) { cgerc_( &m, &n, alpha, x, &incx, y, &incy, a, &lda); } void zgerc(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda) { zgerc_( &m, &n, alpha, x, &incx, y, &incy, a, &lda); } void ssyr(char uplo, int n, float alpha, float *x, int incx, float *a, int lda) { ssyr_( &uplo, &n, &alpha, x, &incx, a, &lda); } void dsyr(char uplo, int n, double alpha, double *x, int incx, double *a, int lda) { dsyr_( &uplo, &n, &alpha, x, &incx, a, &lda); } void ssyr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda) { ssyr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } void dsyr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda) { dsyr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a, &lda); } void cher(char uplo, int n, float alpha, complex *x, int incx, complex *a, int lda) { cher_( 
&uplo, &n, &alpha, x, &incx, a, &lda); } void zher(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *a, int lda) { zher_( &uplo, &n, &alpha, x, &incx, a, &lda); } void cher2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda) { cher2_( &uplo, &n, alpha, x, &incx, y, &incy, a, &lda); } void zher2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda) { zher2_( &uplo, &n, alpha, x, &incx, y, &incy, a, &lda); } void chemv(char uplo, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy) { chemv_( &uplo, &n, alpha, a, &lda, x, &incx, beta, y, &incy ); } void zhemv(char uplo, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy) { zhemv_( &uplo, &n, alpha, a, &lda, x, &incx, beta, y, &incy ); } void stpmv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx) { stpmv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void dtpmv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx) { dtpmv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void ctpmv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx) { ctpmv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void ztpmv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx) { ztpmv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void stpsv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx) { stpsv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void dtpsv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx) { dtpsv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void ctpsv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx) { ctpsv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void ztpsv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx) { ztpsv_( &uplo, &transa, &diag, &n, ap, x, &incx); } void sspr(char uplo, int n, float alpha, float *x, int incx, float *ap ) { sspr_( &uplo, &n, &alpha, x, &incx, ap ); } void dspr(char uplo, int n, double alpha, double *x, int incx, double *ap ) { dspr_( &uplo, &n, &alpha, x, &incx, ap ); } void sspmv(char uplo, int n, float alpha, float *ap, float *x, int incx, float beta, float *y, int incy) { sspmv_( &uplo, &n, &alpha, ap, x, &incx, &beta, y, &incy ); } void dspmv(char uplo, int n, double alpha, double *ap, double *x, int incx, double beta, double *y, int incy) { dspmv_( &uplo, &n, &alpha, ap, x, &incx, &beta, y, &incy ); } void chpmv(char uplo, int n, complex *alpha, complex *ap, complex *x, int incx, complex *beta, complex *y, int incy) { chpmv_( &uplo, &n, alpha, ap, x, &incx, beta, y, &incy ); } void zhpmv(char uplo, int n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy) { zhpmv_( &uplo, &n, alpha, ap, x, &incx, beta, y, &incy ); } void chpr(char uplo, int n, float alpha, complex *x, int incx, complex *ap ) { chpr_( &uplo, &n, &alpha, x, &incx, ap ); } void zhpr(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *ap ) { zhpr_( &uplo, &n, &alpha, x, &incx, ap ); } void sspr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a ) { sspr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a ); } void dspr2(char uplo, int n, double alpha, double *x, int 
incx, double *y, int incy, double *a ) { dspr2_( &uplo, &n, &alpha, x, &incx, y, &incy, a ); } void chpr2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a ) { chpr2_( &uplo, &n, alpha, x, &incx, y, &incy, a ); } void zhpr2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a ) { zhpr2_( &uplo, &n, alpha, x, &incx, y, &incy, a ); } void sgbmv(char trans, int m, int n, int kl, int ku, float alpha, float *a, int inca, float *x, int incx, float beta, float *y, int incy ) { sgbmv_( &trans, &m, &n, &kl, &ku, &alpha, a, &inca, x, &incx, &beta, y, &incy ); } void dgbmv(char trans, int m, int n, int kl, int ku, double alpha, double *a, int inca, double *x, int incx, double beta, double *y, int incy ) { dgbmv_( &trans, &m, &n, &kl, &ku, &alpha, a, &inca, x, &incx, &beta, y, &incy ); } void cgbmv(char trans, int m, int n, int kl, int ku, complex *alpha, complex *a, int inca, complex *x, int incx, complex *beta, complex *y, int incy ) { cgbmv_( &trans, &m, &n, &kl, &ku, alpha, a, &inca, x, &incx, beta, y, &incy ); } void zgbmv(char trans, int m, int n, int kl, int ku, doublecomplex *alpha, doublecomplex *a, int inca, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy ) { zgbmv_( &trans, &m, &n, &kl, &ku, alpha, a, &inca, x, &incx, beta, y, &incy ); } void stbmv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx ) { stbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void dtbmv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx ) { dtbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void ctbmv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx ) { ctbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void ztbmv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx ) { ztbmv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void ssbmv(char uplo, int n, int k, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy ) { ssbmv_( &uplo, &n, &k, &alpha, a, &lda, x, &incx, &beta, y, &incy ); } void dsbmv(char uplo, int n, int k, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy ) { dsbmv_( &uplo, &n, &k, &alpha, a, &lda, x, &incx, &beta, y, &incy ); } void chbmv(char uplo, int n, int k, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy ) { chbmv_( &uplo, &n, &k, alpha, a, &lda, x, &incx, beta, y, &incy ); } void zhbmv(char uplo, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy ) { zhbmv_( &uplo, &n, &k, alpha, a, &lda, x, &incx, beta, y, &incy ); } void stbsv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx ) { stbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void dtbsv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx ) { dtbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void ctbsv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx ) { ctbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } void ztbsv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx ) { ztbsv_( &uplo, &trans, &diag, &n, &k, a, &lda, x, &incx ); } 
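/*
 * Hedged sketch (not part of the original adapter): every wrapper in this file
 * forwards column-major, pass-by-value arguments to the underlying Fortran
 * BLAS symbol, which takes all of its arguments by reference. The guarded
 * helper below only illustrates the intended call pattern for the sgemv()
 * wrapper defined earlier in this file; the 2x2 data is made up for the
 * example and the block is disabled so it has no effect on the build.
 */
#if 0   /* enable locally for a quick sanity check of the adapter */
static void example_sgemv_usage(void)
{
    /* 2x2 identity matrix stored column-major, lda = 2 */
    float a[4] = { 1.0f, 0.0f, 0.0f, 1.0f };
    float x[2] = { 3.0f, 4.0f };
    float y[2] = { 0.0f, 0.0f };

    /* y = 1.0 * A * x + 0.0 * y; y is expected to become { 3.0, 4.0 } */
    sgemv('N', 2, 2, 1.0f, a, 2, x, 1, 0.0f, y, 1);
}
#endif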
void chemm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc) { chemm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void zhemm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc) { zhemm_( &side, &uplo, &m, &n, alpha, a, &lda, b, &ldb, beta, c, &ldc); } void cherk(char uplo, char transa, int n, int k, float alpha, complex *a, int lda, float beta, complex *c, int ldc) { cherk_( &uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } void zherk(char uplo, char transa, int n, int k, double alpha, doublecomplex *a, int lda, double beta, doublecomplex *c, int ldc) { zherk_( &uplo, &transa, &n, &k, &alpha, a, &lda, &beta, c, &ldc); } void cher2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, float beta, complex *c, int ldc) { cher2k_( &uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void zher2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, double beta, doublecomplex *c, int ldc) { zher2k_( &uplo, &transa, &n, &k, alpha, a, &lda, b, &ldb, &beta, c, &ldc); } void sscal( int n, float alpha, float *x, int incx) { sscal_(&n, &alpha, x, &incx); } void dscal( int n, double alpha, double *x, int incx) { dscal_(&n, &alpha, x, &incx); } void cscal( int n, complex* alpha, complex *x, int incx) { cscal_(&n, alpha, x, &incx); } void zscal( int n, doublecomplex* alpha, doublecomplex *x, int incx) { zscal_(&n, alpha, x, &incx); } void csscal( int n, float alpha, complex *x, int incx) { csscal_(&n, &alpha, x, &incx); } void zdscal( int n, double alpha, doublecomplex *x, int incx) { zdscal_(&n, &alpha, x, &incx); } float sdot( int n, float *x, int incx, float *y, int incy) { #ifdef __APPLE__ return cblas_sdot(n, x, incx, y, incy); #else return sdot_(&n, x, &incx, y, &incy); #endif } double ddot( int n, double *x, int incx, double *y, int incy) { #ifdef __APPLE__ return cblas_ddot(n, x, incx, y, incy); #else return ddot_(&n, x, &incx, y, &incy); #endif } complex cdotu( int n, complex *x, int incx, complex *y, int incy) { complex ans; #if defined( _WIN32 ) || defined( _WIN64 ) ans = cdotu_(&n, x, &incx, y, &incy); #elif defined( __APPLE__) cblas_cdotu_sub(n, x, incx, y, incy, &ans); #else cdotusub_(&n, x, &incx, y, &incy, &ans); #endif return ans; } doublecomplex zdotu( int n, doublecomplex *x, int incx, doublecomplex *y, int incy) { doublecomplex ans; #if defined( _WIN32 ) || defined( _WIN64 ) ans = zdotu_(&n, x, &incx, y, &incy); #elif defined(__APPLE__) cblas_zdotu_sub(n, x, incx, y, incy, &ans); #else zdotusub_(&n, x, &incx, y, &incy, &ans); #endif return ans; } complex cdotc( int n, complex *x, int incx, complex *y, int incy) { complex ans; #if defined( _WIN32 ) || defined( _WIN64 ) ans = cdotc_(&n, x, &incx, y, &incy); #elif defined(__APPLE__) cblas_cdotc_sub(n, x, incx, y, incy, &ans); #else cdotcsub_(&n, x, &incx, y, &incy, &ans); #endif return ans; } doublecomplex zdotc( int n, doublecomplex *x, int incx, doublecomplex *y, int incy) { doublecomplex ans; #if defined( _WIN32 ) || defined( _WIN64 ) ans = zdotc_(&n, x, &incx, y, &incy); #elif defined(__APPLE__) cblas_zdotc_sub(n, x, incx, y, incy, &ans); #else zdotcsub_(&n, x, &incx, y, &incy, &ans); #endif return ans; } void scopy( int n, float *x, int incx, float *y, int incy) { scopy_(&n, x, &incx, y, &incy); } 
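/*
 * Hedged sketch (not part of the original adapter): scopy() above simply
 * forwards to the Fortran scopy_ symbol. The guarded helper shows the
 * intended call pattern -- copying every other element of a strided source
 * vector into a contiguous destination; the sizes and values are made up for
 * the example and the block is disabled so it has no effect on the build.
 */
#if 0   /* enable locally for a quick check of the copy wrapper */
static void example_scopy_usage(void)
{
    float src[6] = { 1.0f, -1.0f, 2.0f, -2.0f, 3.0f, -3.0f };
    float dst[3] = { 0.0f, 0.0f, 0.0f };

    /* n = 3 elements, source stride 2, destination stride 1 */
    scopy(3, src, 2, dst, 1);   /* dst is expected to become { 1.0, 2.0, 3.0 } */
}
#endif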
void dcopy( int n, double *x, int incx, double *y, int incy) { dcopy_(&n, x, &incx, y, &incy); } void ccopy( int n, complex *x, int incx, complex *y, int incy) { ccopy_(&n, x, &incx, y, &incy); } void zcopy( int n, doublecomplex *x, int incx, doublecomplex *y, int incy) { zcopy_(&n, x, &incx, y, &incy); } void sswap( int n, float *x, int incx, float *y, int incy) { sswap_(&n, x, &incx, y, &incy); } void dswap( int n, double *x, int incx, double *y, int incy) { dswap_(&n, x, &incx, y, &incy); } void cswap( int n, complex *x, int incx, complex *y, int incy) { cswap_(&n, x, &incx, y, &incy); } void zswap( int n, doublecomplex *x, int incx, doublecomplex *y, int incy) { zswap_(&n, x, &incx, y, &incy); } void saxpy( int n, float alpha, float *x, int incx, float *y, int incy) { saxpy_(&n, &alpha, x, &incx, y, &incy); } void daxpy( int n, double alpha, double *x, int incx, double *y, int incy) { daxpy_(&n, &alpha, x, &incx, y, &incy); } void caxpy( int n, complex *alpha, complex *x, int incx, complex *y, int incy) { caxpy_(&n, alpha, x, &incx, y, &incy); } void zaxpy( int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy) { zaxpy_(&n, alpha, x, &incx, y, &incy); } void srotg(float *A, float *B, float *C, float *S) { srotg_(A, B, C, S); } void drotg(double *A, double *B, double *C, double *S) { drotg_(A, B, C, S); } void crotg(complex *A, complex *B, float *C, complex *S) { crotg_(A, B, C, S); } void zrotg(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S) { zrotg_(A, B, C, S); } void srotmg(float *D1, float *D2, float *X1, const float *Y1, float *PARAM) { srotmg_(D1, D2, X1, (float*)Y1, PARAM); } void drotmg(double *D1, double *D2, double *X1, const double *Y1, double *PARAM) { drotmg_(D1, D2, X1, (double*)Y1, PARAM); } void srot(int N, float *x, int incx, float *y, int incy, float c, float s) { srot_(&N, x, &incx, y, &incy, &c, &s); } void drot(int N, double *x, int incx, double *y, int incy, double c, double s) { drot_(&N, x, &incx, y, &incy, &c, &s); } void csrot(int N, complex *x, int incx, complex *y, int incy, float c, float s) { csrot_(&N, x, &incx, y, &incy, &c, &s); } void zdrot(int N, doublecomplex *cx, int incx, doublecomplex *cy, int incy, double c, double s) { zdrot_(&N, cx, &incx, cy, &incy, &c, &s); } void srotm(int N, float *X, int incx, float *Y, int incy, float* PARAM) { srotm_(&N, X, &incx, Y, &incy, PARAM); } void drotm(int N, double *X, int incx, double *Y, int incy, double* PARAM) { drotm_(&N, X, &incx, Y, &incy, PARAM); } int isamax( int n, float *x, int incx) { return isamax_(&n, x, &incx); } int idamax( int n, double *x, int incx) { return idamax_(&n, x, &incx); } int icamax( int n, complex *x, int incx) { return icamax_(&n, x, &incx); } int izamax( int n, doublecomplex *x, int incx) { return izamax_(&n, x, &incx); } float snrm2( int n, float *x, int incx) { #ifdef __APPLE__ //On OSX passing negative values for incx can lead to a //a crash, so we catch it here (cf. Github issue #37). if (n < 1 || incx < 1) { return 0; } return cblas_snrm2(n, x, incx); #else return snrm2_(&n, x, &incx); #endif } double dnrm2( int n, double *x, int incx) { #ifdef __APPLE__ //On OSX passing negative values for incx can lead to a //a crash, so we catch it here (cf. Github issue #37). if (n < 1 || incx < 1) { return 0; } return cblas_dnrm2(n, x, incx); #else return dnrm2_(&n, x, &incx); #endif } float scnrm2( int n, complex *x, int incx) { #ifdef __APPLE__ //On OSX passing negative values for incx can lead to a //a crash, so we catch it here (cf. 
Github issue #37). if (n < 1 || incx < 1) { return 0; } return cblas_scnrm2(n, x, incx); #else return scnrm2_(&n, x, &incx); #endif } double dznrm2( int n, doublecomplex *x, int incx) { #ifdef __APPLE__ //On OSX passing negative values for incx can lead to a //a crash, so we catch it here (cf. Github issue #37). if (n < 1 || incx < 1) { return 0; } return cblas_dznrm2(n, x, incx); #else return dznrm2_(&n, x, &incx); #endif } float sasum( int n, float *x, int incx) { #ifdef __APPLE__ return cblas_sasum(n, x, incx); #else return sasum_(&n, x, &incx); #endif } double dasum( int n, double *x, int incx) { #ifdef __APPLE__ return cblas_dasum(n, x, incx); #else return dasum_(&n, x, &incx); #endif } float scasum( int n, complex *x, int incx) { #ifdef __APPLE__ return cblas_scasum(n, x, incx); #else return scasum_(&n, x, &incx); #endif } double dzasum( int n, doublecomplex *x, int incx) { #ifdef __APPLE__ return cblas_dzasum(n, x, incx); #else return dzasum_(&n, x, &incx); #endif } #endif clblas-2.10/src/tests/correctness/blas-lapack.h000066400000000000000000000621601264277366700215330ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef BLAS_LAPACK_H_ #define BLAS_LAPACK_H_ #ifdef __cplusplus extern "C" { #endif /* BLAS-2 functions */ void sgemv_( const char *transA, const int *M, const int *N, const float *alpha, const float *A, const int *lda, const float *X, const int *incx, const float *beta, float *Y, const int *incy); void dgemv_( const char *transA, const int *M, const int *N, const double *alpha, const double *A, const int *lda, const double *X, const int *incx, const double *beta, double *Y, const int *incy); void cgemv_( const char *transA, const int *M, const int *N, const complex *alpha, const complex *A, const int *lda, const complex *X, const int *incx, const complex *beta, complex *Y, const int *incy); void zgemv_( const char *transA, const int *M, const int *N, const doublecomplex *alpha, const doublecomplex *A, const int *lda, const doublecomplex *X, const int *incx, const doublecomplex *beta, doublecomplex *Y, const int *incy); void ssymv_( const char *uplo, const int *N, const float *alpha, const float *A, const int *lda, const float *X, const int *incx, const float *beta, float *Y, int *incy); void dsymv_( const char *uplo, const int *N, const double *alpha, const double *A, const int *lda, const double *X, const int *incx, const double *beta, double *Y, const int *incy); /* BLAS-3 functions */ void sgemm_( const char *transA, const char *transB, const int *M, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *B, const int *ldb, const float *beta, float *C, const int *ldc); void dgemm_( const char *transA, const char *transB, const int *M, const int *N, const int *K, const double *alpha, const double *A, const int *lda, const double *B, const int *ldb, const double *beta, double *C, const int *ldc); void cgemm_( const char *transA, const char *transB, const int *M, const int *N, const int *K, const complex *alpha, const complex *A, const int *lda, const complex *B, const int *ldb, const complex *beta, complex *C, const int *ldc); void zgemm_( const char *transA, const char *transB, const int *M, const int *N, const int *K, const doublecomplex *alpha, const doublecomplex *A, const int *lda, const doublecomplex *B, const int *ldb, const doublecomplex *beta, doublecomplex *C, const int *ldc); void strmm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const float *alpha, const float *A, const int *lda, float *B, const int *ldb); void dtrmm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const double *alpha, const double *A, const int *lda, double *B, const int *ldb); void ctrmm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const complex *alpha, const complex *A, const int *lda, complex *B, const int *ldb); void ztrmm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const doublecomplex *alpha, const doublecomplex *A, const int *lda, doublecomplex *B, const int *ldb); void strsm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const float *aplha, const float *A, const int *lda, float *B, const int *ldb); void dtrsm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const double *alpha, const double *A, const int *lda, double *B, const int *ldb); void ctrsm_( 
const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const complex *alpha, const complex *A, const int *lda, complex *B, const int *ldb); void ztrsm_( const char *side, const char *uplo, const char *transA, const char *diag, const int *M, const int *N, const doublecomplex *alpha, const doublecomplex *A, const int *lda, doublecomplex *B, const int *ldb); void ssyr2k_( const char *uplo, const char *transA, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *B, const int *ldb, const float *beta, float *C, const int *ldc); void dsyr2k_( const char *uplo, const char *transA, const int *N, const int *K, const double *alpha, const double *A, const int *lda, const double *B, const int *ldb, const double *beta, double *C, const int *ldc); void csyr2k_( const char *uplo, const char *transA, const int *N, const int *K, const complex *alpha, const complex *A, const int *lda, const complex *B, const int *ldb, const complex *beta, complex *C, const int *ldc); void zsyr2k_( const char *uplo, const char *transA, const int *N, const int *K, const doublecomplex *alpha, const doublecomplex *A, const int *lda, const doublecomplex *B, const int *ldb, const doublecomplex *beta, doublecomplex *C, const int *ldc); void ssyrk_( const char *uplo, const char *transA, const int *N, const int *K, const float *alpha, const float *A, const int *lda, const float *beta, float *C, const int *ldc); void dsyrk_( const char *uplo, const char *transA, const int *N, const int *K, const double *alpha, const double *A, const int *lda, const double *beta, double *C, const int *ldc); void csyrk_( const char *uplo, const char *transA, const int *N, const int *K, const complex *alpha, const complex *A, const int *lda, const complex *beta, complex *C, const int *ldc); void zsyrk_( const char *uplo, const char *transA, const int *N, const int *K, const doublecomplex *alpha, const doublecomplex *A, const int *lda, const doublecomplex *beta, doublecomplex *C, const int *ldc); void strmv_( const char *uplo, const char *transa, const char *diag, const int *n, const float *a, const int *lda, float *x, const int *incx); void dtrmv_( const char *uplo, const char *transa, const char *diag, const int *n, const double *a, const int *lda, double *x, const int *incx); void ctrmv_( const char *uplo, const char *transa, const char *diag, const int *n, const complex *a, const int *lda, complex *x, const int *incx); void ztrmv_( const char *uplo, const char *transa, const char *diag, const int *n, const doublecomplex *a, const int *lda, doublecomplex *x, const int *incx); void strsv_( const char *uplo, const char *transa, const char *diag, const int *n, const float *a, const int *lda, float *x, const int *incx); void dtrsv_( const char *uplo, const char *transa, const char *diag, const int *n, const double *a, const int *lda, double *x, const int *incx); void ctrsv_( const char *uplo, const char *transa, const char *diag, const int *n, const complex *a, const int *lda, complex *x, const int *incx); void ztrsv_( const char *uplo, const char *transa, const char *diag, const int *n, const doublecomplex *a, const int *lda, doublecomplex *x, const int *incx); void ssymm_( const char *side, const char *uplo, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *b, const int *ldb, const float *beta, float *c, const int *ldc); void dsymm_( const char *side, const char *uplo, const int *m, const int *n, const double *alpha, const 
double *a, const int *lda, const double *b, const int *ldb, const double *beta, double *c, const int *ldc); void csymm_( const char *side, const char *uplo, const int *m, const int *n, const complex *alpha, const complex *a, const int *lda, const complex *b, const int *ldb, const complex *beta, complex *c, const int *ldc); void zsymm_( const char *side, const char *uplo, const int *m, const int *n, const doublecomplex *alpha, const doublecomplex *a, const int *lda, const doublecomplex *b, const int *ldb, const doublecomplex *beta, doublecomplex *c, const int *ldc); void sger_( const int *m, const int *n, const float *alpha, const float *x, const int *incx, const float *y, const int *incy, float *a, const int *lda); void dger_( const int *m, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda); void cgeru_( const int *m, const int *n, const complex *alpha, const complex *x, const int *incx, const complex *y, const int *incy, complex *a, const int *lda); void zgeru_( const int *m, const int *n, const doublecomplex *alpha, const doublecomplex *x, const int *incx, const doublecomplex *y, const int *incy, doublecomplex *a, const int *lda); void cgerc_( const int *m, const int *n, const complex *alpha, const complex *x, const int *incx, const complex *y, const int *incy, complex *a, const int *lda); void zgerc_( const int *m, const int *n, const doublecomplex *alpha, const doublecomplex *x, const int *incx, const doublecomplex *y, const int *incy, doublecomplex *a, const int *lda); void ssyr_( const char *uplo, const int *n, const float *alpha, const float *x, const int *incx, float *a, const int *lda); void dsyr_( const char *uplo, const int *n, const double *alpha, const double *x, const int *incx, double *a, const int *lda); void ssyr2_( const char *uplo, const int *n, const float *alpha, const float *x, const int *incx, const float *y, const int *incy, float *a, const int *lda); void dsyr2_( const char *uplo, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a, const int *lda); void cher_( const char *uplo, const int *n, const float *alpha, const complex *x, const int *incx, complex *a, const int *lda); void zher_( const char *uplo, const int *n, const double *alpha, const doublecomplex *x, const int *incx, doublecomplex *a, const int *lda); void cher2_( const char *uplo, const int *n, const complex *alpha, const complex *x, const int *incx, const complex *y, const int *incy, complex *a, const int *lda); void zher2_( const char *uplo, const int *n, const doublecomplex *alpha, const doublecomplex *x, const int *incx, const doublecomplex *y, const int *incy, doublecomplex *a, const int *lda); void chemv_( const char *uplo, const int *n, const complex *alpha, const complex *a, const int *lda, const complex *x, const int *incx, const complex *beta, complex *y, const int *incy); void zhemv_( const char *uplo, const int *n, const doublecomplex *alpha, const doublecomplex *a, const int *lda, const doublecomplex *x, const int *incx, const doublecomplex *beta, doublecomplex *y, const int *incy); void stpmv_( const char *uplo, const char *transa, const char *diag, const int *n, const float *ap, float *x, const int *incx); void dtpmv_( const char *uplo, const char *transa, const char *diag, const int *n, const double *ap, double *x, const int *incx); void ctpmv_( const char *uplo, const char *transa, const char *diag, const int *n, const complex *ap, complex *x, 
const int *incx); void ztpmv_( const char *uplo, const char *transa, const char *diag, const int *n, const doublecomplex *ap, doublecomplex *x, const int *incx); void stpsv_( const char *uplo, const char *transa, const char *diag, const int *n, const float *ap, float *x, const int *incx); void dtpsv_( const char *uplo, const char *transa, const char *diag, const int *n, const double *ap, double *x, const int *incx); void ctpsv_( const char *uplo, const char *transa, const char *diag, const int *n, const complex *ap, complex *x, const int *incx); void ztpsv_( const char *uplo, const char *transa, const char *diag, const int *n, const doublecomplex *ap, doublecomplex *x, const int *incx); void sspr_( const char *uplo, const int *n, const float *alpha, const float *x, const int *incx, float *ap); void dspr_( const char *uplo, const int *n, const double *alpha, const double *x, const int *incx, double *ap); void sspmv_( const char *uplo, const int *n, const float *alpha, const float *ap, const float *x, const int *incx, const float *beta, float *y, const int *incy); void dspmv_( const char *uplo, const int *n, const double *alpha, const double *ap, const double *x, const int *incx, const double *beta, double *y, const int *incy); void chpmv_( const char *uplo, const int *n, const complex *alpha, const complex *ap, const complex *x, const int *incx, const complex *beta, complex *y, const int *incy); void zhpmv_( const char *uplo, const int *n, const doublecomplex *alpha, const doublecomplex *ap, const doublecomplex *x, const int *incx, const doublecomplex *beta, doublecomplex *y, const int *incy); void chpr_( const char *uplo, const int *n, const float *alpha, const complex *x, const int *incx, complex *ap); void zhpr_( const char *uplo, const int *n, const double *alpha, const doublecomplex *x, const int *incx, doublecomplex *ap); void sspr2_( const char *uplo, const int *n, const float *alpha, const float *x, const int *incx, const float *y, const int *incy, float *a ); void dspr2_( const char *uplo, const int *n, const double *alpha, const double *x, const int *incx, const double *y, const int *incy, double *a ); void chpr2_( const char *uplo, const int *n, const complex *alpha, const complex *x, const int *incx, const complex *y, const int *incy, complex *a ); void zhpr2_( const char *uplo, const int *n, const doublecomplex *alpha, const doublecomplex *x, const int *incx, const doublecomplex *y, const int *incy, doublecomplex *a ); void sgbmv_( const char *trans, const int *m, const int *n, const int *kl, const int *ku, const float *alpha, const float *a, const int *inca, const float *x, const int *incx, const float *beta, float *y, const int *incy ); void dgbmv_( const char *trans, const int *m, const int *n, const int *kl, const int *ku, const double *alpha, const double *a, const int *inca, const double *x, const int *incx, const double *beta, double *y, const int *incy ); void cgbmv_( const char *trans, const int *m, const int *n, const int *kl, const int *ku, const complex *alpha, const complex *a, const int *inca, const complex *x, const int *incx, const complex *beta, complex *y, const int *incy ); void zgbmv_( const char *trans, const int *m, const int *n, const int *kl, const int *ku, const doublecomplex *alpha, const doublecomplex *a, const int *inca, const doublecomplex *x, const int *incx, const doublecomplex *beta, doublecomplex *y, const int *incy ); void stbmv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const float *a, const int *lda, 
float *x, const int *incx ); void dtbmv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx ); void ctbmv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const complex *a, const int *lda, complex *x, const int *incx ); void ztbmv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const doublecomplex *a, const int *lda, doublecomplex *x, const int *incx ); void ssbmv_( const char *uplo, const int *n, const int *k, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy ); void dsbmv_( const char *uplo, const int *n, const int *k, const double *alpha, const double *a, const int *lda, const double *x, const int *incx, const double *beta, double *y, const int *incy ); void chbmv_( const char *uplo, const int *n, const int *k, const complex *alpha, const complex *a, const int *lda, const complex *x, const int *incx, const complex *beta, complex *y, const int *incy ); void zhbmv_( const char *uplo, const int *n, const int *k, const doublecomplex *alpha, const doublecomplex *a, const int *lda, const doublecomplex *x, const int *incx, const doublecomplex *beta, doublecomplex *y, const int *incy ); void stbsv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const float *a, const int *lda, float *x, const int *incx ); void dtbsv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const double *a, const int *lda, double *x, const int *incx ); void ctbsv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const complex *a, const int *lda, complex *x, const int *incx ); void ztbsv_( const char *uplo, const char *trans, const char *diag, const int *n, const int *k, const doublecomplex *a, const int *lda, doublecomplex *x, const int *incx ); void chemm_( const char *side, const char *uplo, const int *m, const int *n, const complex *alpha, const complex *a, const int *lda, const complex *b, const int *ldb, const complex *beta, complex *c, const int *ldc); void zhemm_( const char *side, const char *uplo, const int *m, const int *n, const doublecomplex *alpha, const doublecomplex *a, const int *lda, const doublecomplex *b, const int *ldb, const doublecomplex *beta, doublecomplex *c, const int *ldc); void cherk_( const char *uplo, const char *transa, const int *n, const int *k, const float *alpha, const complex *a, const int *lda, const float *beta, complex *c, const int *ldc); void zherk_( const char *uplo, const char *transa, const int *n, const int *k, const double *alpha, const doublecomplex *a, const int *lda, const double *beta, doublecomplex *c, const int *ldc); void cher2k_( const char *uplo, const char *transa, const int *n, const int *k, const complex *alpha, const complex *a, const int *lda, const complex *b, const int *ldb, const float *beta, complex *c, const int *ldc); void zher2k_( const char *uplo, const char *transa, const int *n, const int *k, const doublecomplex *alpha, const doublecomplex *a, const int *lda, const doublecomplex *b, const int *ldb, const double *beta, doublecomplex *c, const int *ldc); void sscal_(int *n, float *alpha, float *x, int *incx); void dscal_(int *n, double *alpha, double *x, int *incx); void cscal_(int *n, complex *alpha, complex *x, int *incx); void zscal_(int *n, doublecomplex *alpha, doublecomplex *x, int *incx); void 
csscal_(int *n, float *alpha, complex *x, int *incx); void zdscal_(int *n, double *alpha, doublecomplex *x, int *incx); void scopy_(int *n, float *x, int *incx, float* y, int *incy); void dcopy_(int *n, double *x, int *incx, double* y, int *incy); void ccopy_(int *n, complex *x, int *incx, complex *y, int *incy); void zcopy_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy); float sdot_(int *n, float *x, int *incx, float* y, int *incy); double ddot_(int *n, double *x, int *incx, double* y, int *incy); #if defined( _WIN32 ) || defined( _WIN64 ) || defined( __APPLE__) complex cdotu_(int *n, complex *x, int *incx, complex* y, int *incy); doublecomplex zdotu_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy); complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy); doublecomplex zdotc_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy); #else void cdotusub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans); void zdotusub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans); void cdotcsub_(int *n, complex *x, int *incx, complex* y, int *incy, complex *ans); void zdotcsub_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy, doublecomplex *ans); #endif void sswap_(int *n, float *x, int *incx, float* y, int *incy); void dswap_(int *n, double *x, int *incx, double* y, int *incy); void cswap_(int *n, complex *x, int *incx, complex *y, int *incy); void zswap_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy); void saxpy_(int *n, float *alpha, float *x, int *incx, float* y, int *incy); void daxpy_(int *n, double *alpha, double *x, int *incx, double* y, int *incy); void caxpy_(int *n, complex *alpha, complex *x, int *incx, complex *y, int *incy); void zaxpy_(int *n, doublecomplex *alpha, doublecomplex *x, int *incx, doublecomplex *y, int *incy); void srotg_(float *A, float *B, float *C, float *S); void drotg_(double *A, double *B, double *C, double *S); void crotg_(complex *A, complex *B, float *C, complex *S); void zrotg_(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S); void srotmg_(float *D1, float *D2, float *X1, float *Y1, float *PARAM); void drotmg_(double *D1, double *D2, double *X1, double *Y1, double *PARAM); void srot_(int *n, float *x, int *incx, float *y, int *incy, float *c, float *s); void drot_(int *n, double *x, int *incx, double *y, int *incy, double *c, double *s); void csrot_(int *n, complex *x, int *incx, complex *y, int *incy, float *c, float *s); void zdrot_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy, double *c, double *s); void srotm_(int* N, float *X, int* incx, float *Y, int* incy, float* PARAM); void drotm_(int* N, double *X, int* incx, double *Y, int* incy, double* PARAM); float sasum_(int *n, float *x, int *incx); double dasum_(int *n, double *x, int *incx); float scasum_(int *n, complex *x, int *incx); double dzasum_(int *n, doublecomplex *x, int *incx); int isamax_(int *n, float *x, int *incx); int idamax_(int *n, double *x, int *incx); int icamax_(int *n, complex *x, int *incx); int izamax_(int *n, doublecomplex *x, int *incx); float snrm2_(int *n, float *x, int *incx); double dnrm2_(int *n, double *x, int *incx); float scnrm2_(int *n, complex *x, int *incx); double dznrm2_(int *n, doublecomplex *x, int *incx); #ifdef __cplusplus } #endif #endif /* BLAS_LAPACK_H */ clblas-2.10/src/tests/correctness/corr-asum.cpp000066400000000000000000000145431264277366700216260ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objAsum, cl_mem objScratch) { if(objX != NULL) { clReleaseMemObject(objX); } if(objAsum != NULL) { clReleaseMemObject(objAsum); } if(objScratch != NULL) { clReleaseMemObject(objScratch); } } template static void deleteBuffers(T *blasX, T *blasAsum=NULL, T *clblasAsum=NULL) { if(blasX != NULL) { delete[] blasX; } if(clblasAsum != NULL) { delete[] clblasAsum; } if(blasAsum != NULL) { delete(blasAsum); } } template void asumCorrectnessTest(TestParams *params) { cl_int err; T1 *blasX; T2 *clblasAsum, *blasAsum; cl_mem bufX, bufAsum, scratchBuff; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasX = new T1[lengthX + params->offBX ]; blasAsum = new T2[1]; clblasAsum = new T2[1 + params->offa]; if((blasX == NULL) || (clblasAsum == NULL) || (blasAsum == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX); deleteBuffers(blasAsum, clblasAsum); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufAsum = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T1)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xASUM routine... "; *blasAsum = ::clMath::blas::asum( params->N, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufAsum == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufAsum, scratchBuff); deleteBuffers(blasX); deleteBuffers(blasAsum, clblasAsum); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xASUM routine... "; DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::asum( type, params->N, bufAsum, params->offa, bufX, params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufAsum, scratchBuff); deleteBuffers(blasX ); deleteBuffers(blasAsum, clblasAsum); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ASUM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufAsum, scratchBuff); deleteBuffers(blasX ); deleteBuffers(blasAsum, clblasAsum); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAsum, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasAsum), clblasAsum, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ASUM: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufAsum, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasAsum), (clblasAsum+params->offa), 1); deleteBuffers(blasX); deleteBuffers(blasAsum, clblasAsum); delete[] events; } // Instantiate the test TEST_P(ASUM, sasum) { TestParams params; getParams(¶ms); asumCorrectnessTest(¶ms); } TEST_P(ASUM, dasum) { TestParams params; getParams(¶ms); asumCorrectnessTest(¶ms); } TEST_P(ASUM, scasum) { TestParams params; getParams(¶ms); asumCorrectnessTest(¶ms); } TEST_P(ASUM, dzasum) { TestParams params; getParams(¶ms); asumCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-axpy.cpp000066400000000000000000000141121264277366700216320ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objY) { if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *X, T *Y, T *blasX, T *blasY) { if(X != NULL) { delete[] X; } if(blasX != NULL) { delete[] blasX; } if(Y != NULL) { delete[] Y; } if(blasY != NULL) { delete[] blasY; } } template void axpyCorrectnessTest(TestParams *params) { cl_int err; T *X, *Y; //For OpenCL implementation T *blasX, *blasY;// For reference implementation cl_mem bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; if((X == NULL) || (blasX == NULL) || (Y == NULL) || (blasY == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(X, Y, blasX, blasY); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Populate X and Y randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy); memcpy(blasX, X, (lengthX + params->offBX) * sizeof(T)); memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T)); alpha = convertMultiplier(params->alpha); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE); if ((bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling reference xAXPY routine... "; ::clMath::blas::axpy((size_t)params->N, alpha, blasX, (size_t)params->offBX, params->incx, blasY, (size_t)params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling clblas xAXPY routine... 
"; err = (cl_int)::clMath::clblas::axpy(params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::AXPY() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(T), Y, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "AXPY: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY); compareMatrices(clblasRowMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), 1); deleteBuffers(X, Y, blasX, blasY); delete[] events; } // Instantiate the test TEST_P(AXPY, saxpy) { TestParams params; getParams(¶ms); axpyCorrectnessTest(¶ms); } TEST_P(AXPY, daxpy) { TestParams params; getParams(¶ms); axpyCorrectnessTest(¶ms); } TEST_P(AXPY, caxpy) { TestParams params; getParams(¶ms); axpyCorrectnessTest(¶ms); } TEST_P(AXPY, zaxpy) { TestParams params; getParams(¶ms); axpyCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-copy.cpp000066400000000000000000000141621264277366700216300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objY) { if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *blasX, T *blasY, T *clblasY) { if(blasX != NULL) { delete[] blasX; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; } } template void copyCorrectnessTest(TestParams *params) { cl_int err; T *blasX, *blasY, *clblasY; cl_mem bufX, bufY; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; if((blasX == NULL) || (blasY == NULL) || (clblasY == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX, blasY, clblasY); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Populate A and blasX randomVectors( params->N, (blasX+params->offBX), params->incx, (blasY+params->offCY), params->incy ); memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(blasY, (lengthY + params->offCY)* sizeof(*blasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xCOPY routine... "; ::clMath::blas::copy( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufX, bufY); deleteBuffers(blasX, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xCOPY routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::copy(type, params->N, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(blasX, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::COPY() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(blasX, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, ((lengthY + params->offCY) * sizeof(*blasY)), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "COPY: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY, NULL); deleteBuffers(blasX, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(COPY, scopy) { TestParams params; getParams(¶ms); copyCorrectnessTest(¶ms); } TEST_P(COPY, dcopy) { TestParams params; getParams(¶ms); copyCorrectnessTest(¶ms); } TEST_P(COPY, ccopy) { TestParams params; getParams(¶ms); copyCorrectnessTest(¶ms); } TEST_P(COPY, zcopy) { TestParams params; getParams(¶ms); copyCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-dot.cpp000066400000000000000000000151111264277366700214370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objY, cl_mem objDP, cl_mem objScratch) { if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } if(objDP != NULL) { clReleaseMemObject(objDP); } if(objScratch != NULL) { clReleaseMemObject(objScratch); } } template static void deleteBuffers(T *blasX, T *blasY, T *blasDP, T *clblasDP) { if(blasX != NULL) { delete[] blasX; } if(blasY != NULL) { delete[] blasY; } if(clblasDP != NULL) { delete[] clblasDP; } if(blasDP != NULL) { delete(blasDP); } } template void dotCorrectnessTest(TestParams *params) { cl_int err; T *blasX, *blasY, *clblasDP, *blasDP; cl_mem bufX, bufY, bufDP, scratchBuff; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; blasDP = new T[1]; clblasDP = new T[1 + params->offa]; if((blasX == NULL) || (blasY == NULL) || (clblasDP == NULL) || (blasDP == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(blasY, (lengthY + params->offCY)* sizeof(*blasY), 0, CL_MEM_READ_WRITE); bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xDOT routine... "; *blasDP = ::clMath::blas::dot( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufDP == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xDOT routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
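/* Illustrative sketch: the dot wrappers need a device-side scratch buffer for
 * the intermediate reduction, holding at least N elements of the working type;
 * that is why scratchBuff above is sized from lengthX.  A direct call for the
 * float case looks roughly like this (placeholder names; standard clBLAS C API
 * assumed):
 *
 *     clblasStatus status = clblasSdot(
 *         N,
 *         bufDP, offDP,                        // receives the dot product
 *         bufX, offx, incx,
 *         bufY, offy, incy,
 *         scratchBuff,                         // >= N elements of cl_float
 *         numCommandQueues, commandQueues,
 *         0, NULL, &event);
 */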
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::dot( type, params->N, bufDP, params->offa, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::DOT() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "DOT: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY, bufDP, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasDP), (clblasDP+params->offa), 1); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; } // Instantiate the test TEST_P(DOT, sdot) { TestParams params; getParams(¶ms); dotCorrectnessTest(¶ms); } TEST_P(DOT, ddot) { TestParams params; getParams(¶ms); dotCorrectnessTest(¶ms); } TEST_P(DOT, cdotu) { TestParams params; getParams(¶ms); dotCorrectnessTest(¶ms); } TEST_P(DOT, zdotu) { TestParams params; getParams(¶ms); dotCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-dotc.cpp000066400000000000000000000145611264277366700216120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objY, cl_mem objDP, cl_mem objScratch) { if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } if(objDP != NULL) { clReleaseMemObject(objDP); } if(objScratch != NULL) { clReleaseMemObject(objScratch); } } template static void deleteBuffers(T *blasX, T *blasY, T *blasDP, T *clblasDP) { if(blasX != NULL) { delete[] blasX; } if(blasY != NULL) { delete[] blasY; } if(clblasDP != NULL) { delete[] clblasDP; } if(blasDP != NULL) { delete(blasDP); } } template void dotcCorrectnessTest(TestParams *params) { cl_int err; T *blasX, *blasY, *clblasDP, *blasDP; cl_mem bufX, bufY, bufDP, scratchBuff; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; blasDP = new T[1]; clblasDP = new T[1 + params->offa]; if((blasX == NULL) || (blasY == NULL) || (clblasDP == NULL) || (blasDP == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, (blasY + params->offCY), params->incy, true); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(blasY, (lengthY + params->offCY)* sizeof(*blasY), 0, CL_MEM_READ_WRITE); bufDP = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xDOTC routine... "; *blasDP = ::clMath::blas::dotc( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufDP == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xDOTC routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
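/* Note: DOTC computes sum_i conj(x[i]) * y[i], i.e. the first operand is
 * conjugated before the multiply, which is why only the complex instantiations
 * (cdotc, zdotc) appear below; the real-valued cases are already covered by the
 * plain DOT tests.
 */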
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::dotc( type, params->N, bufDP, params->offa, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::DOTC() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufDP, scratchBuff); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufDP, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasDP), clblasDP, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "DOTC: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY, bufDP, scratchBuff); compareMatrices(clblasColumnMajor, 1 , 1, (blasDP), (clblasDP+params->offa), 1); deleteBuffers(blasX, blasY, blasDP, clblasDP); delete[] events; } // Instantiate the test TEST_P(DOTC, cdotc) { TestParams params; getParams(¶ms); dotcCorrectnessTest(¶ms); } TEST_P(DOTC, zdotc) { TestParams params; getParams(¶ms); dotcCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-gbmv.cpp000066400000000000000000000170431264277366700216120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas GBMV call results } } template void gbmvCorrectnessTest(TestParams *params) { cl_int err; T *A, *X, *blasY, *clblasY; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; size_t lengthX, lengthY, lengthA; base = clMath::BlasBase::getInstance(); if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); lengthA = ((params->order == clblasColumnMajor)? params->N : params->M) * params->lda; if (params->transA == clblasNoTrans) { lengthX = (params->N - 1)*abs(params->incx) + 1; lengthY = (params->M - 1)*abs(params->incy) + 1; } else { lengthX = (params->M - 1)*abs(params->incx) + 1; lengthY = (params->N - 1)*abs(params->incy) + 1; } A = new T[lengthA + params->offA ]; X = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); randomGbmvMatrices(params->order, params->transA, params->M, params->N, &alpha, &beta, (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xGBMV routine... "; clblasOrder fOrder; clblasTranspose fTrans; fOrder = params->order; fTrans = params->transA; size_t fM = params->M, fN = params->N, fKL = params->KL, fKU = params->KU; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params->transA == clblasNoTrans)? 
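/* Note: the reference BLAS called below is column-major only.  A row-major
 * banded problem is therefore rewritten as the transposed column-major problem:
 * M/N and KL/KU are swapped and the transpose flag is toggled, because a matrix
 * stored row-major occupies the same memory as its transpose stored
 * column-major.  For the conjugate-transpose case the band data is additionally
 * conjugated in place first.
 */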
clblasTrans : clblasNoTrans; fM = params->N; fN = params->M; fKL = params->KU; fKU = params->KL; if( params->transA == clblasConjTrans ) doConjugate( (A+params->offa), 1, lengthA, params->lda ); } clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGBMV routine... "; err = (cl_int)clMath::clblas::gbmv(params->order, params->transA, params->M, params->N, params->KL, params->KU, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "GBMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(A, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(GBMV, sgbmv) { TestParams params; getParams(¶ms); gbmvCorrectnessTest(¶ms); } TEST_P(GBMV, dgbmv) { TestParams params; getParams(¶ms); gbmvCorrectnessTest(¶ms); } TEST_P(GBMV, cgbmv) { TestParams params; getParams(¶ms); gbmvCorrectnessTest(¶ms); } TEST_P(GBMV, zgbmv) { TestParams params; getParams(¶ms); gbmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-gemm.cpp000066400000000000000000000177261264277366700216140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC) { clReleaseMemObject(objA); clReleaseMemObject(objB); clReleaseMemObject(objC); } template static void deleteBuffers(T *A, T *B, T *blasC, T *clblasC) { delete[] A; delete[] B; delete[] blasC; delete[] clblasC; } template void gemmCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO(); beta = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } //::std::cerr << "Generating input data... "; randomGemmMatrices(params->order, params->transA, params->transB, params->M, params->N, params->K, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); //::std::cerr << "Done" << ::std::endl; //::std::cerr << "Calling reference xGEMM routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, A, params->lda, B, params->ldb, beta, blasC, params->ldc); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; T *reorderedC = new T[params->rowsC * params->columnsC]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsB, params->columnsB, B, reorderedB); reorderMatrix(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB, beta, reorderedC, params->rowsC); reorderMatrix(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedB; delete[] reorderedA; } //::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } //::std::cerr << "Calling clblas xGEMM routine... 
"; err = (cl_int)::clMath::clblas::gemm(params->order, params->transA, params->transB, params->M, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } //::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->M, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(GEMM, sgemm) { TestParams params; getParams(¶ms); gemmCorrectnessTest(¶ms); } TEST_P(GEMM, dgemm) { TestParams params; getParams(¶ms); gemmCorrectnessTest(¶ms); } TEST_P(GEMM, cgemm) { TestParams params; getParams(¶ms); gemmCorrectnessTest(¶ms); } TEST_P(GEMM, zgemm) { TestParams params; getParams(¶ms); gemmCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-gemm2.cpp000066400000000000000000000203741264277366700216670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC) { if(objA != NULL) { clReleaseMemObject(objA); } if(objB != NULL) { clReleaseMemObject(objB); } if(objC != NULL) { clReleaseMemObject(objC); } } template static void deleteBuffers(T *A, T *B, T *blasC, T *clblasC) { if(A != NULL) { delete[] A; } if(B != NULL) { delete[] B; } if(blasC != NULL) { delete[] blasC; } if(clblasC != NULL) { delete[] clblasC; } } template void gemm2CorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; base = clMath::BlasBase::getInstance(); useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO(); beta = ZERO(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; if((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, B, blasC, clblasC); SUCCEED(); return; } srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } ::std::cerr << "Generating input data... "; randomGemmMatrices(params->order, params->transA, params->transB, params->M, params->N, params->K, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xGEMM routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, A, params->lda, B, params->ldb, beta, blasC, params->ldc); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; T *reorderedC = new T[params->rowsC * params->columnsC]; if((reorderedA == NULL) || (reorderedB == NULL) || (reorderedC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" 
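/* Note: the reference GEMM used here is column-major only, so in the row-major
 * case the test copies A, B and C into column-major buffers with
 * reorderMatrix(), runs the reference routine with the row counts
 * (rowsA/rowsB/rowsC) as leading dimensions, and reorders the result back into
 * blasC before it is compared against the clBLAS output.
 */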
<< ::std::endl; SUCCEED(); return; } reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsB, params->columnsB, B, reorderedB); reorderMatrix(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::gemm(clblasColumnMajor, params->transA, params->transB, params->M, params->N, params->K, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB, beta, reorderedC, params->rowsC); reorderMatrix(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGEMM routine... "; err = (cl_int)::clMath::clblas::gemm2(params->order, params->transA, params->transB, params->M, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->M, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(GEMM2, sgemm2) { TestParams params; getParams(¶ms); gemm2CorrectnessTest(¶ms); } TEST_P(GEMM2, dgemm2) { TestParams params; getParams(¶ms); gemm2CorrectnessTest(¶ms); } TEST_P(GEMM2, cgemm2) { TestParams params; getParams(¶ms); gemm2CorrectnessTest(¶ms); } TEST_P(GEMM2, zgemm2) { TestParams params; getParams(¶ms); gemm2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-gemv.cpp000066400000000000000000000176541264277366700216250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { clReleaseMemObject(objA); clReleaseMemObject(objX); clReleaseMemObject(objY); } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { delete[] A; delete[] X; delete[] blasY; delete[] clblasY; } template void gemvCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC, *X, *Y; T alpha, beta; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha, useBeta; cl_event *events; size_t lenY, lenX; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); useBeta = base->useBeta(); beta = ZERO(); alpha = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; X = &B[params->offBX]; Y = &blasC[params->offCY]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } if (params->transA == clblasNoTrans) { lenX = params->N; lenY = params->M; } else { lenX = params->M; lenY = params->N; } ::std::cerr << "Generating input data... "; setNans(params->rowsA * params->columnsA, A); setNans(params->rowsB * params->columnsB, B); setNans(params->rowsC * params->columnsC, blasC); randomGemmxMatrices(params->order, params->transA, params->transB, params->transC, lenY, params->K, lenX, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); // set to NAN elements which must not be accessed // in matrix B containing vector X setVectorNans(params->offBX, abs(params->incx), B, lenX, params->columnsB * params->rowsB); // in matrix C containing vector Y setVectorNans(params->offCY, abs(params->incy), blasC, lenY, params->columnsC * params->rowsC); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xGEMV routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::gemv(clblasColumnMajor, params->transA, params->M, params->N, alpha, A, params->lda, X, params->incx, beta, Y, params->incy); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); ::clMath::blas::gemv(clblasColumnMajor, params->transA, params->M, params->N, alpha, reorderedA, params->rowsA, X, params->incx, beta, Y, params->incy); delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), 0, CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGEMV routine... "; err = (cl_int)::clMath::clblas::gemv(params->order, params->transA, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->incx, beta, bufC, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GEMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareVectors(params->offCY, lenY, abs(params->incy), params->columnsC * params->rowsC, blasC, clblasC); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(GEMV, sgemv) { TestParams params; getParams(¶ms); gemvCorrectnessTest(¶ms); } TEST_P(GEMV, dgemv) { TestParams params; getParams(¶ms); gemvCorrectnessTest(¶ms); } TEST_P(GEMV, cgemv) { TestParams params; getParams(¶ms); gemvCorrectnessTest(¶ms); } TEST_P(GEMV, zgemv) { TestParams params; getParams(¶ms); gemvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-ger.cpp000066400000000000000000000164531264277366700214400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objx, cl_mem objy) { if( objA!=NULL) clReleaseMemObject(objA); if( objx!=NULL) clReleaseMemObject(objx); if( objy!=NULL) clReleaseMemObject(objy); } template static void deleteBuffers(T *A, T *x, T *y, T *backA) { if(A != NULL) { delete[] A; } if(backA != NULL) { delete[] backA; } if(x != NULL) { delete[] x; } if(y != NULL) { delete[] y; } } template void gerCorrectnessTest(TestParams *params) { cl_int err; T *A, *x, *y, *backA; //size_t N, M; T alpha_; cl_mem bufA, bufx, bufy; clMath::BlasBase *base; cl_event *events; // int ka, kxy; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA; if( params->order == clblasColumnMajor ) lengthA = params->N * params->lda; else lengthA = params->M * params->lda; size_t lengthx = (1 + (((params->M)-1) * abs(params->incx))); size_t lengthy = (1 + (((params->N)-1) * abs(params->incy))); bool useAlpha = base->useAlpha(); if (useAlpha) { alpha_ = convertMultiplier(params->alpha); } A = new T[lengthA + params->offa]; x = new T[lengthx + params->offBX]; y = new T[lengthy + params->offCY]; backA = new T[lengthA + params->offa]; if((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, backA, x, y); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID BlasFn = CLBLAS_GER; populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags); populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn ); populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn ); // Copy C to backX memcpy(backA, A, (lengthA + params->offa) * sizeof(T)); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE); bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY); bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xGER routine... 
"; clblasOrder fOrder; size_t fN, fM; size_t fOffx, fOffy; int fIncx, fIncy; T *fX, *fY; fOrder = params->order; fM = params->M; fN = params->N; fIncx = params->incx; fIncy = params->incy; fX = x; fY = y; fOffx = params->offBX; fOffy = params->offCY; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fM = params->N; fN = params->M; fX = y; fY = x; fIncx = params->incy; fIncy = params->incx; fOffx = params->offCY; fOffy = params->offBX; } // Call reference blas routine clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGER routine... "; err = (cl_int)::clMath::clblas::ger( params->order, params->M, params->N, alpha_, bufx, params->offBX, params->incx, bufy, params->offCY, params->incy,bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GER() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa)* sizeof(*backA), backA, 0, NULL, NULL); releaseMemObjects(bufA, bufx, bufy); // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda); deleteBuffers(A, x, y, backA); delete[] events; } // Instantiate the test TEST_P(GER, sger) { TestParams params; getParams(¶ms); gerCorrectnessTest(¶ms); } TEST_P(GER, dger) { TestParams params; getParams(¶ms); gerCorrectnessTest(¶ms); } TEST_P(GER, cgeru) { TestParams params; getParams(¶ms); gerCorrectnessTest(¶ms); } TEST_P(GER, zgeru) { TestParams params; getParams(¶ms); gerCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-gerc.cpp000066400000000000000000000164201264277366700215750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objx, cl_mem objy) { if( objA!=NULL) clReleaseMemObject(objA); if( objx!=NULL) clReleaseMemObject(objx); if( objy!=NULL) clReleaseMemObject(objy); } template static void deleteBuffers(T *A, T *x, T *y, T *backA) { if(A != NULL) { delete[] A; } if(backA != NULL) { delete[] backA; } if(x != NULL) { delete[] x; } if(y != NULL) { delete[] y; } } template void gercCorrectnessTest(TestParams *params) { cl_int err; T *A, *x, *y, *backA; //size_t N, M; T alpha_; cl_mem bufA, bufx, bufy; clMath::BlasBase *base; cl_event *events; // int ka, kxy; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA; if( params->order == clblasColumnMajor ) lengthA = params->N * params->lda; else lengthA = params->M * params->lda; size_t lengthx = (1 + (((params->M)-1) * abs(params->incx))); size_t lengthy = (1 + (((params->N)-1) * abs(params->incy))); bool useAlpha = base->useAlpha(); if (useAlpha) { alpha_ = convertMultiplier(params->alpha); } A = new T[lengthA + params->offa]; x = new T[lengthx + params->offBX]; y = new T[lengthy + params->offCY]; backA = new T[lengthA + params->offa]; if((A == NULL) || (backA == NULL) || (x == NULL) || (y == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, x, y, backA); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID BlasFn = CLBLAS_GER; populate( (A + params->offa), params->M, params->N, params-> lda, BlasFn, creationFlags); populate( (x + params->offBX), lengthx, 1, lengthx, BlasFn ); populate( (y + params->offCY), lengthy, 1, lengthy, BlasFn ); // Copy C to backX memcpy(backA, A, (lengthA + params->offa) * sizeof(T)); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE); bufx = base->createEnqueueBuffer(x, (lengthx + params->offBX) * sizeof(*x), 0, CL_MEM_READ_ONLY); bufy = base->createEnqueueBuffer(y, (lengthy + params->offCY) * sizeof(*y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xGER routine... 
"; clblasOrder fOrder; size_t fN, fM; size_t fOffx, fOffy; int fIncx, fIncy; T *fX, *fY; fOrder = params->order; fM = params->M; fN = params->N; fIncx = params->incx; fIncy = params->incy; fX = x; fY = y; fOffx = params->offBX; fOffy = params->offCY; if (fOrder != clblasColumnMajor) { doConjugate( (y + params->offCY), (1 + (params->N-1) * abs(params->incy)), 1, 1 ); fOrder = clblasColumnMajor; fM = params->N; fN = params->M; fX = y; fY = x; fIncx = params->incy; fIncy = params->incx; fOffx = params->offCY; fOffy = params->offBX; // Note this according to the Legacy guide clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda); } else { clMath::blas::gerc(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A, params->offa, params->lda); } ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufx == NULL) || (bufy == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xGER routine... "; err = (cl_int)::clMath::clblas::gerc( params->order, params->M, params->N, alpha_, bufx, params->offBX, params->incx, bufy, params->offCY, params->incy,bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GER() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufx, bufy); deleteBuffers(A, x, y, backA); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa)* sizeof(*backA), backA, 0, NULL, NULL); releaseMemObjects(bufA, bufx, bufy); // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, A+ params->offa, backA + params->offa, params->lda); deleteBuffers(A, x, y, backA); delete[] events; } // Instantiate the test TEST_P(GERC, cgerc) { TestParams params; getParams(¶ms); gercCorrectnessTest(¶ms); } TEST_P(GERC, zgerc) { TestParams params; getParams(¶ms); gercCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hbmv.cpp000066400000000000000000000156021264277366700216120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas GBMV call results } } template void hbmvCorrectnessTest(TestParams *params) { cl_int err; T *A, *X, *blasY, *clblasY; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; size_t lengthX, lengthY, lengthA; base = clMath::BlasBase::getInstance(); if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); lengthA = params->N * params->lda; lengthX = (params->N - 1)*abs(params->incx) + 1; lengthY = (params->N - 1)*abs(params->incy) + 1; A = new T[lengthA + params->offA ]; X = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); randomGbmvMatrices(params->order, clblasNoTrans, params->N, params->N, &alpha, &beta, (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xHBMV routine... "; clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; fUplo = params->uplo; size_t fN = params->N, fK = params->K; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower; doConjugate( (A + params->offA), params->N, params->lda, params->lda ); } clMath::blas::hbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. 
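/* Note: createEnqueueBuffer() returns NULL when the cl_mem object cannot be
 * created or its initial data cannot be enqueued (typically the matrix does not
 * fit on the device), so this path releases all resources and reports the case
 * as skipped rather than failed.
 */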
releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHBMV routine... "; err = (cl_int)clMath::clblas::hbmv(params->order, params->uplo, params->N, params->K, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::GBMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "GBMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(A, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(HBMV, chbmv) { TestParams params; getParams(¶ms); hbmvCorrectnessTest(¶ms); } TEST_P(HBMV, zhbmv) { TestParams params; getParams(¶ms); hbmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hemm.cpp000066400000000000000000000174711264277366700216120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC) { if(objA != NULL) { clReleaseMemObject(objA); } if(objB != NULL) { clReleaseMemObject(objB); } if(objC != NULL) { clReleaseMemObject(objC); } } template static void deleteBuffers(T *A, T *B, T *C, T *backC) { if(A != NULL) { delete[] A; } if(B != NULL) { delete[] B; } if(C != NULL) { delete[] C; } if(backC != NULL) { delete[] backC;// To hold the original C } } template void hemmCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *C, *backC; T alpha_, beta_; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; cl_event *events; size_t ka, kbc; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; if (events == NULL) { std::cerr << ">> WARNING: Unable to allocate memory for events" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } memset(events, 0, params->numCommandQueues * sizeof(cl_event)); if( params->side == clblasLeft ) ka = params->M; else ka = params->N; if( params->order == clblasColumnMajor ) kbc = params->N; else kbc = params->M; size_t lengthA = ka * params->lda; size_t lengthB = kbc * params->ldb; size_t lengthC = kbc * params->ldc; alpha_ = convertMultiplier(params->alpha); beta_ = convertMultiplier(params->beta); A = new T[ lengthA + params->offA ]; B = new T[ lengthB + params->offBX ]; C = new T[ lengthC + params->offCY ]; backC = new T[ lengthC + params->offCY ]; if((A == NULL) || (B == NULL) || (C == NULL) || (backC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, B, C, backC); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... " << std::endl; int creationFlags = 0, AcreationFlags; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); AcreationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_HEMM; populate( A + params->offA , ka, ka, params-> lda, BlasFn, AcreationFlags); populate( B + params->offBX , params-> M, params-> N, params-> ldb, BlasFn, creationFlags); populate( C + params->offCY , params-> M, params-> N, params-> ldc, BlasFn, creationFlags); memcpy(backC, C, (lengthC + params->offCY) * sizeof(T)); //printMatrixBlock( params->order, 0, 0, params->M, params->N, params->ldc, backC); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA) * sizeof(T), 0, CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, (lengthB + params->offBX) * sizeof(T), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(backC, (lengthC + params->offCY) * sizeof(T), 0, CL_MEM_READ_WRITE); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xHEMM routine... 
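// Aside (our sketch, not clBLAS code): the block that follows reduces a
// row-major HEMM to the column-major reference routine. A row-major M x N
// matrix occupies the same memory as a column-major N x M matrix, so
// C = alpha*A*B + beta*C becomes C^T = alpha*B^T*A^T + beta*C^T: M and N
// swap, the side flips, and the referenced triangle of the Hermitian A flips
// while alpha stays unchanged. A tiny helper expressing that remapping
// (struct and function names are ours):
#include <algorithm>
#include <cstddef>
#include <clBLAS.h>

struct HemmMapping {
    clblasSide side; clblasUplo uplo; size_t M, N;
};

static HemmMapping hemmRowMajorAsColumnMajor(HemmMapping a)
{
    std::swap(a.M, a.N);
    a.side = (a.side == clblasLeft)  ? clblasRight : clblasLeft;
    a.uplo = (a.uplo == clblasUpper) ? clblasLower : clblasUpper;
    return a;
}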
"; clblasOrder fOrder; clblasUplo fUplo; clblasSide fSide; size_t fN, fM; fOrder = params->order; fUplo = params->uplo; fSide = params->side; fM = params->M; fN = params->N; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fM = params->N; fN = params->M; fSide = (params->side == clblasLeft)? clblasRight: clblasLeft; fUplo = (params->uplo == clblasUpper)? clblasLower: clblasUpper; } // Call reference blas routine clMath::blas::hemm(fOrder, fSide, fUplo, fM, fN, alpha_, A, params->offA, params->lda, B, params->offBX, params->ldb, beta_, C, params->offCY, params->ldc); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHEMM routine... "; err = (cl_int)::clMath::clblas::hemm( params->order, params->side, params->uplo, params->M, params->N, alpha_, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta_, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HEMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, (lengthC + params->offCY) * sizeof(T), backC, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "WARNING: corr-hemm: Erorr reading buffer..." << err << ::std::endl; } //printMatrixBlock( params->order, 0, 0, params->M, params->N, params->ldc, backC); releaseMemObjects(bufA, bufB, bufC); // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, (C + params->offCY), (backC + params->offCY), params->ldc); deleteBuffers(A, B, C, backC); delete[] events; } // Instantiate the test TEST_P(HEMM, chemm) { TestParams params; getParams(¶ms); hemmCorrectnessTest(¶ms); } TEST_P(HEMM, zhemm) { TestParams params; getParams(¶ms); hemmCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hemv.cpp000066400000000000000000000175411264277366700216210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas HEMV call results } } /* template static void printVector(T *data, size_t length) { for(int i =0; i < length; i ++) { printf("(%20f, %20f)\n", data[i].s[0], data[i].s[1]); } } */ template void hemvCorrectnessTest(TestParams *params) { cl_int err; T *A, *X, *blasY, *clblasY; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); A = new T[lengthA + params->offA ]; X = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); // beta.s[0] = 0.0f; // beta.s[1] = 0.0f; randomHemvMatrices(params->order, params->uplo, params->N, true, &alpha, (A + params->offA), params->lda, (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; /* printf("\n\n before acml call\nA\n"); printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, A+params->offA); printf("\nX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, X+params->offBX); printf("\nY\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthY, 1, lengthY, blasY+params->offCY); printf("\nY\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthY, 1, lengthY, clblasY + params->offCY); */ // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); ::std::cerr << "Calling reference xHEMV routine... 
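// Aside (illustrative only): the HEMV operation being checked here is
// y := alpha*A*x + beta*y with A Hermitian and only one triangle referenced.
// A naive unit-stride, zero-offset, column-major/upper model using
// std::complex (the function name is ours, not a clBLAS symbol):
#include <complex>
#include <cstddef>

template <typename R>
static void refHemvUpper(size_t N, std::complex<R> alpha,
                         const std::complex<R> *A, size_t lda,
                         const std::complex<R> *x,
                         std::complex<R> beta, std::complex<R> *y)
{
    for (size_t i = 0; i < N; ++i)
        y[i] *= beta;
    for (size_t j = 0; j < N; ++j) {
        for (size_t i = 0; i < j; ++i) {
            std::complex<R> aij = A[i + j * lda];   // strictly upper element
            y[i] += alpha * aij * x[j];
            y[j] += alpha * std::conj(aij) * x[i];  // implicit A(j,i)
        }
        // BLAS treats the diagonal of a Hermitian matrix as real
        y[j] += alpha * std::real(A[j + j * lda]) * x[j];
    }
}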
"; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; doConjugate( (A + params->offA), params->N, params->N, params->lda ); } ::clMath::blas::hemv( order, fUplo, params->N, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; /* printf("\n\n after acml call\n"); printf("\nY\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthY, 1, lengthY, blasY+params->offCY); printf("Y in different format\n"); printVector(blasY+params->offCY, lengthY); */ if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHEMV routine... "; err = (cl_int)::clMath::clblas::hemv(params->order, params->uplo, params->N, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HEMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HEMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); /* printf("\n\n after our call\n"); printf("\nY\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthY, 1, lengthY, clblasY+params->offCY); printf("Y in different format\n"); printVector(clblasY+params->offCY, lengthY); */ compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(A, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(HEMV, chemv) { TestParams params; getParams(¶ms); hemvCorrectnessTest(¶ms); } TEST_P(HEMV, zhemv) { TestParams params; getParams(¶ms); hemvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-her.cpp000066400000000000000000000142021264277366700214270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX) { if( objA!=NULL) clReleaseMemObject(objA); if( objX!=NULL) clReleaseMemObject(objX); } template static void deleteBuffers(T *A, T *X, T *backA) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(backA != NULL) { delete[] backA; } } template void herCorrectnessTest(TestParams *params) { cl_int err; T *A, *X, *backA; T alpha_; cl_mem bufA, bufX; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); alpha_ = convertMultiplier(params->alpha); A = new T[lengthA + params->offa ]; backA = new T[lengthA + params->offa ]; X = new T[lengthX + params->offBX ]; if((A == NULL) || (backA == NULL) || (X == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, X, backA); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (A + params->offa), params->lda, (X + params->offBX), params->incx ); memcpy(backA, A, (lengthA + params->offa)* sizeof(*A)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xHER routine... "; clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; fUplo = params->uplo; if (fOrder != clblasColumnMajor) { doConjugate( (X + params->offBX), (1 + (params->N-1) * abs(params->incx)), 1, 1 ); fOrder = clblasColumnMajor; fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; } clMath::blas::her( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, A, params->offa, params->lda ); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX); deleteBuffers(backA, A, X); delete[] events; if(bufA == NULL) { ::std::cerr << "BufA is null, lengthA is " << lengthA << ::std::endl; } if(bufX == NULL) { ::std::cerr << "BufX is null, lengthX is " << lengthX << ::std::endl; } ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHER routine... 
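// Aside (illustrative only): the HER update verified by this test is
// A := alpha*x*conj(x)^T + A with a real alpha; only the stored triangle is
// touched and the diagonal stays real. A unit-stride, column-major/upper
// sketch with std::complex (our naming):
#include <complex>
#include <cstddef>

template <typename R>
static void refHerUpper(size_t N, R alpha, const std::complex<R> *x,
                        std::complex<R> *A, size_t lda)
{
    for (size_t j = 0; j < N; ++j) {
        std::complex<R> t = alpha * std::conj(x[j]);
        for (size_t i = 0; i < j; ++i)
            A[i + j * lda] += x[i] * t;
        // x[j]*t equals alpha*|x[j]|^2, so the diagonal remains exactly real
        R d = std::real(A[j + j * lda]) + std::real(x[j] * t);
        A[j + j * lda] = std::complex<R>(d, R(0));
    }
}
// This mirrors why the test passes CREAL(alpha_) to both the reference and
// the clBLAS routine: HER takes a real scalar even for complex matrices.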
"; err = (cl_int)::clMath::clblas::her( params->order, params->uplo, params->N, CREAL(alpha_), bufX, params->offBX, params->incx, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(backA, A, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(backA, A, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*A), backA, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HER: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX); printf("Comparing the results\n"); compareMatrices(params->order, params->N , params->N, (A + params->offa), (backA + params->offa), params->lda); deleteBuffers( A, backA, X); delete[] events; } // Instantiate the test TEST_P(HER, cher) { TestParams params; getParams(¶ms); herCorrectnessTest(¶ms); } TEST_P(HER, zher) { TestParams params; getParams(¶ms); herCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-her2.cpp000066400000000000000000000155201264277366700215150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *blasA, T *clblasA, T *X, T *Y) { if(blasA != NULL) { delete[] blasA; } if(clblasA != NULL) { delete[] clblasA; } if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } } template void her2CorrectnessTest(TestParams *params) { cl_int err; T *blasA, *clblasA, *X, *Y; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double2)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N - 1) * abs(params->incx))); size_t lengthY = (1 + ((params->N - 1) * abs(params->incy))); blasA = new T[lengthA + params->offa ]; clblasA = new T[lengthA + params->offa ]; X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; srand(params->seed); if((blasA == NULL) || (clblasA == NULL) || (X == NULL) || (Y == NULL)) { deleteBuffers(blasA, clblasA, X, Y); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); ::std::cerr << "Generating input data... "; randomHer2Matrices(params->order, params->uplo, params->N, &alpha, (blasA + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasA to clblasA memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xHER2 routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { doConjugate( (X + params->offBX), 1, (1 + (params->N-1) * abs(params->incx)), (1 + (params->N-1) * abs(params->incx)) ); doConjugate( (Y + params->offCY), 1, (1 + (params->N-1) * abs(params->incy)), (1 + (params->N-1) * abs(params->incy)) ); order = clblasColumnMajor; fUplo = (fUplo == clblasLower)? 
clblasUpper : clblasLower; ::clMath::blas::her2( order, fUplo, params->N, alpha, Y, params->offCY, params->incy, X, params->offBX, params->incx, blasA, params->offa, params->lda); } else { ::clMath::blas::her2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasA, params->offa, params->lda); } ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHER2 routine... "; err = (cl_int)::clMath::clblas::her2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HER2: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); compareMatrices(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); deleteBuffers(blasA, clblasA, X, Y); delete[] events; } // Instantiate the test TEST_P(HER2, cher2) { TestParams params; getParams(¶ms); her2CorrectnessTest(¶ms); } TEST_P(HER2, zher2) { TestParams params; getParams(¶ms); her2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-her2k.cpp000066400000000000000000000167161264277366700217000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
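// Aside (illustrative only): the HER2 update exercised above is
// A := alpha*x*conj(y)^T + conj(alpha)*y*conj(x)^T + A. A unit-stride,
// column-major/upper sketch with std::complex (our naming):
#include <complex>
#include <cstddef>

template <typename R>
static void refHer2Upper(size_t N, std::complex<R> alpha,
                         const std::complex<R> *x, const std::complex<R> *y,
                         std::complex<R> *A, size_t lda)
{
    for (size_t j = 0; j < N; ++j) {
        std::complex<R> t1 = alpha * std::conj(y[j]);
        std::complex<R> t2 = std::conj(alpha) * std::conj(x[j]);
        for (size_t i = 0; i < j; ++i)
            A[i + j * lda] += x[i] * t1 + y[i] * t2;
        // the two rank-1 terms are conjugates of each other on the diagonal
        R d = std::real(A[j + j * lda]) + std::real(x[j] * t1 + y[j] * t2);
        A[j + j * lda] = std::complex<R>(d, R(0));
    }
}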
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objC, cl_mem objB) { if(objA != NULL) clReleaseMemObject(objA); if(objC != NULL) clReleaseMemObject(objC); if(objB != NULL) clReleaseMemObject(objB); } template static void deleteBuffers(T *A, T *B, T *blasC, T *clblasC) { if(A != NULL) delete[] A; if(B != NULL) delete[] B; if(blasC != NULL) delete[] blasC; if(clblasC != NULL) delete[] clblasC; } template void her2kCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufC, bufB; clMath::BlasBase *base; cl_event *events; if (params->transA == clblasTrans) { ::std::cerr << ">> her2k(TRANSPOSE) for complex numbers " "is not allowed." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; if((A == NULL) || (B == NULL) || (blasC == NULL) || (clblasC == NULL)) { deleteBuffers(A, B, blasC, clblasC); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } srand(params->seed); alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); ::std::cerr << "Generating input data... "; clblasTranspose ftransB = (params->transA==clblasNoTrans)? clblasConjTrans: clblasNoTrans; randomGemmMatrices(params->order, params->transA, ftransB, params->N, params->N, params->K, true, &alpha, A, params->lda, B, params->ldb, true, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL)|| (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling reference xHER2K routine... 
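// Aside (illustrative only): the HER2K operation checked by this test is
// C := alpha*A*B^H + conj(alpha)*B*A^H + beta*C with beta real and C
// Hermitian. A naive column-major sketch for transA == clblasNoTrans,
// upper triangle, zero offsets (std::complex, our naming):
#include <complex>
#include <cstddef>

template <typename R>
static void refHer2kUpperNoTrans(size_t N, size_t K, std::complex<R> alpha,
                                 const std::complex<R> *A, size_t lda,
                                 const std::complex<R> *B, size_t ldb,
                                 R beta, std::complex<R> *C, size_t ldc)
{
    for (size_t j = 0; j < N; ++j) {
        for (size_t i = 0; i <= j; ++i) {
            std::complex<R> acc(0, 0);
            for (size_t l = 0; l < K; ++l)
                acc += alpha * A[i + l * lda] * std::conj(B[j + l * ldb])
                     + std::conj(alpha) * B[i + l * ldb] * std::conj(A[j + l * lda]);
            std::complex<R> c = beta * C[i + j * ldc] + acc;
            // keep the diagonal real so the result stays Hermitian
            C[i + j * ldc] = (i == j) ? std::complex<R>(std::real(c), R(0)) : c;
        }
    }
}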
"; T fAlpha = alpha; if (params->order == clblasColumnMajor) { ::clMath::blas::her2k(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, fAlpha, A, 0, params->lda, B, 0, params->ldb, CREAL(beta), blasC, 0, params->ldc); } else { CIMAG( fAlpha ) *= -1.0; // According to netlib C- interface clblasTranspose fTransA = (params->transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans; clblasUplo fUplo = (params->uplo == clblasUpper) ? clblasLower : clblasUpper; ::clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params->N, params->K, fAlpha, A, 0, params->lda, B, 0, params->ldb, CREAL(beta), blasC, 0, params->ldc); } ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling clblas xHER2K routine... "; err = (cl_int)::clMath::clblas::her2k(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, CREAL(beta), bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HER2K() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(HER2K, cher2k) { TestParams params; getParams(¶ms); her2kCorrectnessTest(¶ms); } TEST_P(HER2K, zher2k) { TestParams params; getParams(¶ms); her2kCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-herk.cpp000066400000000000000000000174731264277366700216170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objC) { if(objA != NULL) { clReleaseMemObject(objA); } if(objC != NULL) { clReleaseMemObject(objC); } } template static void deleteBuffers(T *A, T *blasC, T *clblasC) { if(A != NULL) { delete[] A; } if(blasC != NULL) { delete[] blasC; } if(clblasC != NULL) { delete[] clblasC; } } template void herkCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; if (params->transA == clblasTrans) { ::std::cerr << ">> herk(TRANSPOSE) for complex numbers " "is not allowed." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } base = clMath::BlasBase::getInstance(); alpha = ZERO(); beta = ZERO(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; if((A == NULL) || (blasC == NULL) || (clblasC == NULL)) { deleteBuffers(A, blasC, clblasC); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } srand(params->seed); useAlpha = true; useBeta = true; alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); ::std::cerr << "Generating input data... "; randomGemmMatrices(params->order, params->transA, clblasNoTrans, params->N, params->N, params->K, useAlpha, &alpha, A, params->lda, NULL, 0, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xHERK routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::herk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, CREAL(alpha), A, params->lda, CREAL(beta), blasC, params->ldc); } else { /* T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedC = new T[params->rowsC * params->columnsC]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::herk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, CREAL(alpha), reorderedA, params->rowsA, CREAL(beta), reorderedC, params->rowsC); reorderMatrix(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedA; */ clblasTranspose fTransA = (params->transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans; clblasUplo fUplo = (params->uplo == clblasUpper) ? 
clblasLower : clblasUpper; ::clMath::blas::herk(clblasColumnMajor, fUplo, fTransA, params->N, params->K, CREAL(alpha), A, params->lda, CREAL(beta), blasC, params->ldc); } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHERK routine... "; err = (cl_int)::clMath::clblas::herk(params->order, params->uplo, params->transA, params->N, params->K, CREAL(alpha), bufA, params->offA, params->lda, CREAL(beta), bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HERK() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufC); compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(HERK, cherk) { TestParams params; getParams(¶ms); herkCorrectnessTest(¶ms); } TEST_P(HERK, zherk) { TestParams params; getParams(¶ms); herkCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hpmv.cpp000066400000000000000000000152001264277366700216220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
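// Aside (our sketch, not clBLAS code): the row-major branch above flips both
// uplo and the transpose flag instead of physically reordering the matrices.
// Viewing the same buffers column-major turns a row-major
// C = alpha*A*A^H + beta*C into C^T = alpha*(A^T)^H*(A^T) + beta*C^T, so
// clblasNoTrans becomes clblasConjTrans and the referenced triangle of C
// flips. A tiny helper expressing that remapping (names are ours):
#include <clBLAS.h>

struct HerkMapping { clblasUplo uplo; clblasTranspose trans; };

static HerkMapping herkRowMajorAsColumnMajor(HerkMapping a)
{
    a.trans = (a.trans == clblasNoTrans) ? clblasConjTrans : clblasNoTrans;
    a.uplo  = (a.uplo == clblasUpper) ? clblasLower : clblasUpper;
    return a;
}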
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas HPMV call results } } template void hpmvCorrectnessTest(TestParams *params) { cl_int err; T *AP, *X, *blasY, *clblasY; cl_mem bufAP, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = (params->N * (params->N + 1)) / 2; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); AP = new T[lengthA + params->offA ]; X = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(AP, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); randomHemvMatrices(params->order, params->uplo, params->N, true, &alpha, (AP + params->offA), params->lda, (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xHPMV routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; doConjugate( (AP + params->offA), lengthA, 1, 1 ); } ::clMath::blas::hpmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." 
<< ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHPMV routine... "; err = (cl_int)::clMath::clblas::hpmv(params->order, params->uplo, params->N, alpha, bufAP, params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HPMV: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(HPMV, chpmv) { TestParams params; getParams(¶ms); hpmvCorrectnessTest(¶ms); } TEST_P(HPMV, zhpmv) { TestParams params; getParams(¶ms); hpmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hpr.cpp000066400000000000000000000142071264277366700214470ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
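// Aside (illustrative only): HPMV uses packed Hermitian storage, which is why
// the AP buffer above holds N*(N+1)/2 elements. For column-major upper-packed
// data, element A(i,j) with i <= j (0-based) lives at AP[i + j*(j+1)/2]. A
// naive unit-stride sketch of y := alpha*A*x + beta*y over that layout
// (std::complex, names are ours):
#include <complex>
#include <cstddef>

static inline size_t packedUpperIdx(size_t i, size_t j) // requires i <= j
{
    return i + (j * (j + 1)) / 2;
}

template <typename R>
static void refHpmvUpper(size_t N, std::complex<R> alpha,
                         const std::complex<R> *AP, const std::complex<R> *x,
                         std::complex<R> beta, std::complex<R> *y)
{
    for (size_t i = 0; i < N; ++i)
        y[i] *= beta;
    for (size_t j = 0; j < N; ++j) {
        for (size_t i = 0; i < j; ++i) {
            std::complex<R> aij = AP[packedUpperIdx(i, j)];
            y[i] += alpha * aij * x[j];
            y[j] += alpha * std::conj(aij) * x[i];
        }
        y[j] += alpha * std::real(AP[packedUpperIdx(j, j)]) * x[j];
    }
}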
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX) { if( objA!=NULL) clReleaseMemObject(objA); if( objX!=NULL) clReleaseMemObject(objX); } template static void deleteBuffers(T *A, T *X, T *backA) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(backA != NULL) { delete[] backA; } } template void hprCorrectnessTest(TestParams *params) { cl_int err; T *AP, *X, *backA; T alpha_; cl_mem bufAP, bufX; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double2)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthAP = (params->N *( params->N + 1 ))/2 ; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); alpha_ = convertMultiplier(params->alpha); AP = new T[lengthAP + params->offa ]; backA = new T[lengthAP + params->offa ]; X = new T[lengthX + params->offBX ]; if((AP == NULL) || (backA == NULL) || (X == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(AP, X, backA); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomHerMatrices( params->order, params->uplo, params->N, &alpha_, (AP + params->offa), params->lda, (X + params->offBX), params->incx ); memcpy(backA, AP, (lengthAP + params->offa)* sizeof(T)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX) * sizeof(*X), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xHPR routine... "; clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; fUplo = params->uplo; if (fOrder != clblasColumnMajor) { doConjugate( (X + params->offBX), (1 + (params->N-1) * abs(params->incx)), 1, 1 ); fOrder = clblasColumnMajor; fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; } clMath::blas::hpr( fOrder, fUplo, params->N, CREAL(alpha_), X , params->offBX, params->incx, AP, params->offa); ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufAP, bufX); deleteBuffers(backA, AP, X); delete[] events; if(bufAP == NULL) { ::std::cerr << "BufA is null, lengthA is " << lengthAP << ::std::endl; } if(bufX == NULL) { ::std::cerr << "BufX is null, lengthX is " << lengthX << ::std::endl; } ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHPR routine... 
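// Aside (illustrative only): HPR applies the same rank-1 update as HER,
// A := alpha*x*conj(x)^T + A with real alpha, but over the packed triangle
// (lengthAP above is N*(N+1)/2). A unit-stride, column-major/upper-packed
// sketch with std::complex (our naming):
#include <complex>
#include <cstddef>

template <typename R>
static void refHprUpper(size_t N, R alpha, const std::complex<R> *x,
                        std::complex<R> *AP)
{
    for (size_t j = 0; j < N; ++j) {
        std::complex<R> t = alpha * std::conj(x[j]);
        size_t col = (j * (j + 1)) / 2;     // offset of packed column j
        for (size_t i = 0; i < j; ++i)
            AP[col + i] += x[i] * t;
        // the diagonal update alpha*|x[j]|^2 is real; keep it exactly real
        R d = std::real(AP[col + j]) + std::real(x[j] * t);
        AP[col + j] = std::complex<R>(d, R(0));
    }
}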
"; err = (cl_int)::clMath::clblas::hpr( params->order, params->uplo, params->N, CREAL(alpha_), bufX, params->offBX, params->incx, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX); deleteBuffers(backA, AP, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPR() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX); deleteBuffers(backA, AP, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(T), backA, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HPR: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX); printf("Comparing the results\n"); compareMatrices(clblasColumnMajor, lengthAP, 1, (AP + params->offa), (backA + params->offa), lengthAP); deleteBuffers( AP, backA, X); delete[] events; } // Instantiate the test TEST_P(HPR, chpr) { TestParams params; getParams(¶ms); hprCorrectnessTest(¶ms); } TEST_P(HPR, zhpr) { TestParams params; getParams(¶ms); hprCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-hpr2.cpp000066400000000000000000000155001264277366700215260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *blasA, T *clblasA, T *X, T *Y) { if(blasA != NULL) { delete[] blasA; } if(clblasA != NULL) { delete[] clblasA; } if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } } template void hpr2CorrectnessTest(TestParams *params) { cl_int err; T *blasAP, *clblasAP, *X, *Y; cl_mem bufAP, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double2)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthAP = (params->N *( params->N + 1 ))/2 ; size_t lengthX = (1 + ((params->N - 1) * abs(params->incx))); size_t lengthY = (1 + ((params->N - 1) * abs(params->incy))); blasAP = new T[lengthAP + params->offa ]; clblasAP = new T[lengthAP + params->offa ]; X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; srand(params->seed); if((blasAP == NULL) || (clblasAP == NULL) || (X == NULL) || (Y == NULL)) { deleteBuffers(blasAP, clblasAP, X, Y); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); ::std::cerr << "Generating input data... "; randomHer2Matrices(params->order, params->uplo, params->N, &alpha, (blasAP + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasA to clblasA memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xHPR2 routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { doConjugate( (X + params->offBX), 1, (1 + (params->N-1) * abs(params->incx)), (1 + (params->N-1) * abs(params->incx)) ); doConjugate( (Y + params->offCY), 1, (1 + (params->N-1) * abs(params->incy)), (1 + (params->N-1) * abs(params->incy)) ); order = clblasColumnMajor; fUplo = (fUplo == clblasLower)? 
clblasUpper : clblasLower; ::clMath::blas::hpr2( order, fUplo, params->N, alpha, Y, params->offCY, params->incy, X, params->offBX, params->incx, blasAP, params->offa); } else { ::clMath::blas::hpr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasAP, params->offa); } ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xHPR2 routine... "; err = (cl_int)::clMath::clblas::hpr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::HPR2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "HPR2: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX, bufY); compareMatrices(clblasColumnMajor, lengthAP, 1, (blasAP + params->offa), (clblasAP + params->offa), lengthAP); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; } // Instantiate the test TEST_P(HPR2, chpr2) { TestParams params; getParams(¶ms); hpr2CorrectnessTest(¶ms); } TEST_P(HPR2, zhpr2) { TestParams params; getParams(¶ms); hpr2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-iamax.cpp000066400000000000000000000143541264277366700217600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
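// Aside (informal justification, not clBLAS documentation): the row-major
// branch above conjugates X and Y, flips uplo, and then passes them to the
// reference HPR2 with their roles exchanged. This works because the
// row-major upper packed triangle of A, reread as a column-major lower
// packed triangle, represents A^T = conj(A) (A is Hermitian), and taking the
// conjugate of the HPR2 update gives, elementwise,
//   conj(alpha*x_i*conj(y_j) + conj(alpha)*y_i*conj(x_j))
//     = alpha*conj(y_i)*x_j + conj(alpha)*conj(x_i)*y_j,
// which is exactly the HPR2 update applied to x := conj(y), y := conj(x)
// with the original alpha. The same reasoning applies to the HER2 test
// earlier in this directory.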
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objiAmax, cl_mem objScratch) { if(objX != NULL) { clReleaseMemObject(objX); } if(objiAmax != NULL) { clReleaseMemObject(objiAmax); } if(objScratch != NULL) { clReleaseMemObject(objScratch); } } template static void deleteBuffers(T *blasX, int *blasiAmax=NULL, int *clblasiAmax=NULL) { if(blasX != NULL) { delete[] blasX; } if(clblasiAmax != NULL) { delete[] clblasiAmax; } if(blasiAmax != NULL) { delete(blasiAmax); } } template void iamaxCorrectnessTest(TestParams *params) { cl_int err; T *blasX; int *clblasiAmax, *blasiAmax; cl_mem bufX, bufiAmax, scratchBuff; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasX = new T[lengthX + params->offBX ]; blasiAmax = new int[1]; clblasiAmax = new int[1 + params->offa]; if((blasX == NULL) || (clblasiAmax == NULL) || (blasiAmax == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, NULL, 0); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_ONLY); bufiAmax = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(int), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (2 * lengthX * sizeof(T)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xiAMAX routine... "; *blasiAmax = ::clMath::blas::iamax( params->N, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufiAmax == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufiAmax, scratchBuff); deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xiAMAX routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::iamax( type, params->N, bufiAmax, params->offa, bufX, params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufiAmax, scratchBuff); deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::iAMAX() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufiAmax, scratchBuff); deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufiAmax, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasiAmax), clblasiAmax, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "iAMAX: Reading results failed...." << std::endl; } compareValues((blasiAmax), (clblasiAmax+params->offa), 0); releaseMemObjects(bufX, bufiAmax, scratchBuff); deleteBuffers(blasX, blasiAmax, clblasiAmax); delete[] events; } // Instantiate the test TEST_P(iAMAX, isamax) { TestParams params; getParams(¶ms); iamaxCorrectnessTest(¶ms); } TEST_P(iAMAX, idamax) { TestParams params; getParams(¶ms); iamaxCorrectnessTest(¶ms); } TEST_P(iAMAX, icamax) { TestParams params; getParams(¶ms); iamaxCorrectnessTest(¶ms); } TEST_P(iAMAX, izamax) { TestParams params; getParams(¶ms); iamaxCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-nrm2.cpp000066400000000000000000000151771264277366700215430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
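// A minimal sketch of the scaled sum-of-squares algorithm that reference
// NRM2 implementations commonly use; every element is divided by the running
// scale, which is why the nrm2 test in this file accumulates a per-element
// delta. Illustrative only, not the clBLAS kernel; assumes incx > 0.
#include <cstddef>
#include <cmath>

static double refNrm2(size_t n, const double *x, int incx)
{
    double scale = 0.0;
    double ssq   = 1.0;
    for (size_t i = 0; i < n; i++) {
        double xi = std::fabs(x[i * incx]);
        if (xi != 0.0) {
            if (scale < xi) {
                ssq   = 1.0 + ssq * (scale / xi) * (scale / xi);
                scale = xi;
            } else {
                ssq  += (xi / scale) * (xi / scale);
            }
        }
    }
    return scale * std::sqrt(ssq);     // = sqrt(sum x_i^2) without overflow
}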
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "delta.h" static void releaseMemObjects(cl_mem objX, cl_mem objNrm2, cl_mem objScratch) { if(objX != NULL) { clReleaseMemObject(objX); } if(objNrm2 != NULL) { clReleaseMemObject(objNrm2); } if(objScratch != NULL) { clReleaseMemObject(objScratch); } } template static void deleteBuffers(T *blasX, T *blasNRM2=NULL, T *clblasNRM2=NULL) { if(blasX != NULL) { delete[] blasX; } if(clblasNRM2 != NULL) { delete[] clblasNRM2; } if(blasNRM2 != NULL) { delete(blasNRM2); } } template void nrm2CorrectnessTest(TestParams *params) { cl_int err; T1 *blasX; T2 *clblasNRM2, *blasNRM2; cl_mem bufX, bufNRM2, scratchBuff; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasX = new T1[lengthX + params->offBX ]; blasNRM2 = new T2[1]; clblasNRM2 = new T2[1 + params->offa]; if((blasX == NULL) || (clblasNRM2 == NULL) || (blasNRM2 == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xNRM2 routine... "; *blasNRM2 = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xNRM2 routine... "; DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; err = (cl_int)::clMath::clblas::nrm2( type, params->N, bufNRM2, params->offa, bufX, params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::NRM2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "NRM2: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufNRM2, scratchBuff); deltaForType = DELTA_0(); // Since every element of X encounters a division, delta would be sum of deltas for every element in X cl_double delta = 0; for(unsigned int i=0; i<(params->N); i++) { delta += deltaForType * returnMax(blasX[params->offBX + i]); } compareValues( (blasNRM2), (clblasNRM2+params->offa), delta); deleteBuffers(blasX); deleteBuffers(blasNRM2, clblasNRM2); delete[] events; } // Instantiate the test TEST_P(NRM2, snrm2) { TestParams params; getParams(¶ms); nrm2CorrectnessTest(¶ms); } TEST_P(NRM2, dnrm2) { TestParams params; getParams(¶ms); nrm2CorrectnessTest(¶ms); } TEST_P(NRM2, scnrm2) { TestParams params; getParams(¶ms); nrm2CorrectnessTest(¶ms); } TEST_P(NRM2, dznrm2) { TestParams params; getParams(¶ms); nrm2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-rot.cpp000066400000000000000000000145211264277366700214610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
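// A minimal sketch of the plane (Givens) rotation that xROT applies; in the
// test in this file the alpha and beta multipliers are passed in as c and s.
// Real case only, illustrative, assumes positive increments.
#include <cstddef>

static void refRot(size_t n, float *x, int incx, float *y, int incy,
                   float c, float s)
{
    for (size_t i = 0; i < n; i++) {
        float xi = x[i * incx];
        float yi = y[i * incy];
        x[i * incx] = c * xi + s * yi;
        y[i * incy] = c * yi - s * xi;
    }
}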
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include //#include "delta.h" static void releaseMemObjects(cl_mem bufX, cl_mem bufY) { if(bufX != NULL) { clReleaseMemObject(bufX); } if(bufY != NULL) { clReleaseMemObject(bufY); } } template static void deleteBuffers(T *X, T *Y, T *back_X, T *back_Y) { if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } if(back_X != NULL) { delete[] back_X; } if(back_Y != NULL) { delete[] back_Y; } } template void rotCorrectnessTest(TestParams *params) { cl_int err; T *X, *Y, *back_X, *back_Y; T alpha, beta; cl_mem bufX, bufY; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthx = 1 + (params->N - 1) * abs(params->incx); size_t lengthy = 1 + (params->N - 1) * abs(params->incy); X = new T[lengthx + params->offa]; Y = new T[lengthy + params->offb]; back_X = new T[lengthx + params->offa]; back_Y = new T[lengthy + params->offb]; if((X == NULL) || (Y == NULL) || (back_X == NULL) || (back_Y == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(X, Y, back_X, back_Y); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; //Filling random values for SA and SB. C & S are only for output sake randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy); alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); memcpy(back_X, X, (lengthx + params->offa) * sizeof(T)); memcpy(back_Y, Y, (lengthy + params->offb) * sizeof(T)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xROT routine... "; ::clMath::blas::rot(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy, alpha, beta); ::std::cerr << "Done" << ::std::endl; // Hold X vector if ((bufX == NULL) || (bufY == NULL)) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, back_X, back_Y); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xROT routine... 
"; err = (cl_int)::clMath::clblas::rot( params->N, bufX, params->offa, params->incx, bufY, params->offb, params->incy, alpha, beta, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, back_X, back_Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROT() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, back_X, back_Y ); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthy + params->offb) * sizeof(T), Y, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ROT: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY); compareMatrices(clblasRowMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), 1); compareMatrices(clblasRowMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), 1); deleteBuffers(X, Y, back_X, back_Y); delete[] events; } // Instantiate the test TEST_P(ROT, srot) { TestParams params; getParams(¶ms); rotCorrectnessTest(¶ms); } TEST_P(ROT, drot) { TestParams params; getParams(¶ms); rotCorrectnessTest(¶ms); } TEST_P(ROT, csrot) { TestParams params; getParams(¶ms); rotCorrectnessTest(¶ms); } TEST_P(ROT, zdrot) { TestParams params; getParams(¶ms); rotCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-rotg.cpp000066400000000000000000000216141264277366700216310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include #include "delta.h" static void releaseMemObjects(cl_mem bufSA, cl_mem bufSB, cl_mem bufC, cl_mem bufS) { if(bufSA != NULL) { clReleaseMemObject(bufSA); } if(bufSB != NULL) { clReleaseMemObject(bufSB); } if(bufC != NULL) { clReleaseMemObject(bufC); } if(bufS != NULL) { clReleaseMemObject(bufS); } } template static void deleteBuffers(T *A, T *B, T *C=NULL, T *D=NULL, T *E=NULL, T *F=NULL) { if(A != NULL) { delete[] A; } if(B != NULL) { delete[] B; } if(C != NULL) { delete[] C; } if(D != NULL) { delete[] D; } if(E != NULL) { delete[] E; } if(F != NULL) { delete[] F; } } // type T1 indicates the basic type, // while T2 indicates type of buffer C. 
C is not complex for complex types template void rotgCorrectnessTest(TestParams *params) { cl_int err; T1 *SA, *SB, *S, *back_SA, *back_SB, *back_S; T2 *C, *back_C; cl_mem bufSA, bufSB, bufC, bufS; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t length = 1;//only one element need to be accessed always SA = new T1[length + params->offBX ]; SB = new T1[length + params->offCY ]; C = new T2[length + params->offa ]; S = new T1[length + params->offb ]; back_SA = new T1[length + params->offBX ]; back_SB = new T1[length + params->offCY ]; back_C = new T2[length + params->offa ]; back_S = new T1[length + params->offb ]; if((SA == NULL) || (SB == NULL) || (C == NULL) || (S == NULL) || (back_SA == NULL) || (back_SB == NULL) || (back_C == NULL) || (back_S == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; //Filling random values for SA and SB. C & S are only for output sake randomVectors(1, (SA+params->offBX), 1, (SB+params->offCY), 1); S[params->offb] = back_S[params->offb] = ZERO(); C[params->offa] = back_C[params->offa] = ZERO(); back_SA[params->offBX] = SA[params->offBX]; back_SB[params->offCY] = SB[params->offCY]; ::std::cerr << "Done" << ::std::endl; //printing the inputs, as they change after processing ::std::cerr << "A = "; printElement(SA[params->offBX]); ::std::cerr << "\tB = "; printElement(SB[params->offCY]); ::std::cerr << "\tC = "; printElement(C[params->offa]); ::std::cerr << "\tS = "; printElement(S[params->offb]); ::std::cout << std::endl << std::endl; // Allocate buffers bufSA = base->createEnqueueBuffer(SA, (length + params->offBX) * sizeof(T1), 0, CL_MEM_READ_WRITE); bufSB = base->createEnqueueBuffer(SB, (length + params->offCY) * sizeof(T1), 0, CL_MEM_READ_WRITE); bufC = base->createEnqueueBuffer(C, (length + params->offa ) * sizeof(T2), 0, CL_MEM_WRITE_ONLY); bufS = base->createEnqueueBuffer(S, (length + params->offb ) * sizeof(T1), 0, CL_MEM_WRITE_ONLY); ::std::cerr << "Calling reference xROTG routine... "; ::clMath::blas::rotg(back_SA, params->offBX, back_SB, params->offCY, back_C, params->offa, back_S, params->offb); ::std::cerr << "Done" << ::std::endl; // Hold X vector if ((bufSA == NULL) || (bufSB == NULL) || (bufC == NULL) || (bufS == NULL)) { releaseMemObjects(bufSA, bufSB, bufC, bufS); deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xROTG routine... "; DataType type; type = ( typeid(T1) == typeid(cl_float)) ? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double)) ? 
TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2)) ? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; err = (cl_int)::clMath::clblas::rotg( type, bufSA, params->offBX, bufSB, params->offCY, bufC, params->offa, bufS, params->offb, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufSA, bufSB, bufC, bufS); deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTG() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufSA, bufSB, bufC, bufS); deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufSA, CL_TRUE, 0, (length + params->offBX) * sizeof(T1), SA, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufSB, CL_TRUE, 0, (length + params->offCY) * sizeof(T1), SB, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, (length + params->offa) * sizeof(T2), C, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufS, CL_TRUE, 0, (length + params->offb) * sizeof(T1), S, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ROTG: Reading results failed...." << std::endl; } releaseMemObjects(bufSA, bufSB, bufC, bufS); deltaForType = DELTA_0(); cl_double delta; delta = deltaForType * returnMax(back_SA[params->offBX]); compareValues( (back_SA + params->offBX), (SA + params->offBX), delta); delta = deltaForType * returnMax(back_SB[params->offCY]); compareValues( (back_SB + params->offCY), (SB + params->offCY), delta); delta = deltaForType * returnMax(back_C[params->offa]); compareValues( (back_C + params->offa), (C + params->offa), delta); delta = deltaForType * returnMax(back_S[params->offb]); compareValues( (back_S + params->offb), (S + params->offb), delta); deleteBuffers(SA, SB, S, back_SA, back_SB, back_S); deleteBuffers(C, back_C); delete[] events; } // Instantiate the test TEST_P(ROTG, srotg) { TestParams params; getParams(¶ms); rotgCorrectnessTest(¶ms); } TEST_P(ROTG, drotg) { TestParams params; getParams(¶ms); rotgCorrectnessTest(¶ms); } TEST_P(ROTG, crotg) { TestParams params; getParams(¶ms); rotgCorrectnessTest(¶ms); } TEST_P(ROTG, zrotg) { TestParams params; getParams(¶ms); rotgCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-rotm.cpp000066400000000000000000000157471264277366700216510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
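// A minimal sketch of the modified Givens transform that xROTM applies.
// PARAM holds { flag, h11, h21, h12, h22 }; the flag selects which entries
// of H are implicit. Based on the conventional BLAS srotm definition and
// illustrative only; assumes positive increments.
#include <cstddef>

static void refRotm(size_t n, float *x, int incx, float *y, int incy,
                    const float *param)
{
    float flag = param[0];
    if (flag == -2.0f) {
        return;                                  // H is the identity
    }
    float h11, h12, h21, h22;
    if (flag == -1.0f) {                         // full 2x2 matrix in PARAM
        h11 = param[1]; h21 = param[2]; h12 = param[3]; h22 = param[4];
    } else if (flag == 0.0f) {                   // unit diagonal
        h11 = 1.0f;     h21 = param[2]; h12 = param[3]; h22 = 1.0f;
    } else {                                     // flag == 1: unit off-diagonal
        h11 = param[1]; h21 = -1.0f;    h12 = 1.0f;     h22 = param[4];
    }
    for (size_t i = 0; i < n; i++) {
        float xi = x[i * incx];
        float yi = y[i * incy];
        x[i * incx] = h11 * xi + h12 * yi;
        y[i * incy] = h21 * xi + h22 * yi;
    }
}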
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem bufX, cl_mem bufY, cl_mem bufParam) { if(bufX != NULL) { clReleaseMemObject(bufX); } if(bufY != NULL) { clReleaseMemObject(bufY); } if(bufParam != NULL) { clReleaseMemObject(bufParam); } } template static void deleteBuffers(T *X, T *Y, T *PARAM, T *back_X, T *back_Y, T *back_PARAM) { if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } if(PARAM != NULL) { delete[] PARAM; } if(back_X != NULL) { delete[] back_X; } if(back_Y != NULL) { delete[] back_Y; } if(back_PARAM != NULL) { delete[] back_PARAM; } } template void rotmCorrectnessTest(TestParams *params) { cl_int err; T *X, *Y, *back_X, *back_Y; T *PARAM, *back_PARAM; T sflagParam; cl_mem bufX, bufY, bufParam; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthx = 1 + (params->N - 1) * abs(params->incx); size_t lengthy = 1 + (params->N - 1) * abs(params->incy); X = new T[lengthx + params->offa]; Y = new T[lengthy + params->offb]; PARAM = new T[5 + params->offc]; //params always has 5 elements back_X = new T[lengthx + params->offa]; back_Y = new T[lengthy + params->offb]; back_PARAM = new T[5 + params->offc]; //params always has 5 elements if((X == NULL) || (Y == NULL) || (PARAM == NULL) || (back_X == NULL) || (back_Y == NULL) || (back_PARAM == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (X + params->offa), params->incx, (Y+params->offb), params->incy); randomVectors(4, (PARAM + params->offc + 1), 1); //1st element is initialized separately sflagParam = convertMultiplier(params->alpha); PARAM[params->offc] = sflagParam; // initializing first element memcpy(back_X, X, (lengthx + params->offa)*sizeof(T)); memcpy(back_Y, Y, (lengthy + params->offb)*sizeof(T)); memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthx + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthy + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); bufParam = base->createEnqueueBuffer(PARAM, (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xROTM routine... "; ::clMath::blas::rotm(params->N, back_X, params->offa, params->incx, back_Y, params->offb, params->incy, back_PARAM, params->offc); ::std::cerr << "Done" << ::std::endl; if ((bufX == NULL) || (bufY == NULL) || (bufParam == NULL)) { releaseMemObjects(bufX, bufY, bufParam); deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." 
<< ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xROTM routine... "; DataType type; type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT : TYPE_DOUBLE; err = (cl_int)::clMath::clblas::rotm( type, params->N, bufX, params->offa, params->incx, bufY, params->offb, params->incy, bufParam, params->offc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufParam); deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY, bufParam); deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthx + params->offa) * sizeof(T), X, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthy + params->offb) * sizeof(T), Y, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ROTM: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY, bufParam); compareMatrices(clblasColumnMajor, lengthx , 1, (back_X + params->offa), (X + params->offa), lengthx); compareMatrices(clblasColumnMajor, lengthy , 1, (back_Y + params->offb), (Y + params->offb), lengthy); deleteBuffers(X, Y, PARAM, back_X, back_Y, back_PARAM); delete[] events; } // Instantiate the test TEST_P(ROTM, srotm) { TestParams params; getParams(¶ms); rotmCorrectnessTest(¶ms); } TEST_P(ROTM, drotm) { TestParams params; getParams(¶ms); rotmCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-rotmg.cpp000066400000000000000000000223401264277366700220030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include #include "delta.h" static void releaseMemObjects(cl_mem bufD1, cl_mem bufD2, cl_mem bufX, cl_mem bufY, cl_mem bufParam) { if(bufD1 != NULL) { clReleaseMemObject(bufD1); } if(bufD2 != NULL) { clReleaseMemObject(bufD2); } if(bufX != NULL) { clReleaseMemObject(bufX); } if(bufY != NULL) { clReleaseMemObject(bufY); } if(bufParam != NULL) { clReleaseMemObject(bufParam); } } template static void deleteBuffers(T *D1, T *D2, T *X, T *Y, T *PARAM) { if(D1 != NULL) { delete[] D1; } if(D2 != NULL) { delete[] D2; } if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } if(PARAM != NULL) { delete[] PARAM; } } template void rotmgCorrectnessTest(TestParams *params) { cl_int err; T *D1, *D2, *X, *Y, *PARAM; T *back_D1, *back_D2, *back_X, *back_Y, *back_PARAM; T sflagParam; cl_mem bufD1, bufD2, bufX, bufY, bufParam; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); X = new T[1 + params->offBX]; Y = new T[1 + params->offCY]; D1 = new T[1 + params->offa]; D2 = new T[1 + params->offb]; PARAM = new T[5 + params->offc]; //params always has 5 elements back_X = new T[1 + params->offBX]; back_Y = new T[1 + params->offCY]; back_D1 = new T[1 + params->offa]; back_D2 = new T[1 + params->offb]; back_PARAM = new T[5 + params->offc]; //params always has 5 elements if((D1 == NULL) || (D2 == NULL) || (X == NULL) || (Y == NULL) || (PARAM == NULL) || (back_D1 == NULL) || (back_D2 == NULL) ||(back_X == NULL) || (back_Y == NULL) || (back_PARAM == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; //Filling random values for SA and SB. 
C & S are only for output sake randomRotmg( (D1 + params->offa), (D2 + params->offb), (X + params->offBX), (Y + params->offCY), (PARAM + params->offc) ); sflagParam = convertMultiplier(params->alpha); PARAM[params->offc] = sflagParam; // initializing first element memcpy(back_X, X, (1 + params->offBX)*sizeof(T)); memcpy(back_Y, Y, (1 + params->offCY)*sizeof(T)); memcpy(back_D1, D1, (1 + params->offa)*sizeof(T)); memcpy(back_D2, D2, (1 + params->offb)*sizeof(T)); memcpy(back_PARAM, PARAM, (params->offc + 5)*sizeof(T)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufD1 = base->createEnqueueBuffer(D1, (1 + params->offa) * sizeof(T), 0, CL_MEM_READ_WRITE); bufD2 = base->createEnqueueBuffer(D2, (1 + params->offb) * sizeof(T), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (1 + params->offBX) * sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (1 + params->offCY) * sizeof(T), 0, CL_MEM_READ_ONLY); bufParam = base->createEnqueueBuffer(PARAM, (5 + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xROTMG routine... "; ::clMath::blas::rotmg(back_D1, params->offa, back_D2, params->offb, back_X, params->offBX, back_Y, params->offCY, back_PARAM, params->offc); ::std::cerr << "Done" << ::std::endl; // Hold X vector if ((bufD1 == NULL) || (bufD2 == NULL) || (bufX == NULL) || (bufY == NULL) || (bufParam == NULL)) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xROTMG routine... "; DataType type; type = ( typeid(T) == typeid(cl_float)) ? TYPE_FLOAT : TYPE_DOUBLE; err = (cl_int)::clMath::clblas::rotmg( type, bufD1, params->offa, bufD2, params->offb, bufX, params->offBX, bufY, params->offCY, bufParam, params->offc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::ROTMG() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufD1, CL_TRUE, 0, (1 + params->offa) * sizeof(T), D1, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufD2, CL_TRUE, 0, (1 + params->offb) * sizeof(T), D2, 0, NULL, NULL); err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (1 + params->offBX) * sizeof(T), X, 0, NULL, NULL); err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (1 + params->offCY) * sizeof(T), Y, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufParam, CL_TRUE, 0, (5 + params->offc) * sizeof(T), PARAM, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "ROTMG: Reading results failed...." 
<< std::endl; } releaseMemObjects(bufD1, bufD2, bufX, bufY, bufParam); deltaForType = DELTA_0(); #ifndef CORR_TEST_WITH_ACML // Acml doesn't store answer in D1, D2 and X1. So skipping those checks cl_double delta; delta = deltaForType * returnMax(back_D1[params->offa]); compareValues( (back_D1 + params->offa), (D1 + params->offa), delta); delta = deltaForType * returnMax(back_D2[params->offb]); compareValues( (back_D2 + params->offb), (D2 + params->offb), delta); delta = deltaForType * returnMax(back_X[params->offBX]); compareValues( (back_X + params->offBX), (X + params->offBX), delta); delta = deltaForType * returnMax(back_Y[params->offCY]); compareValues( (back_Y + params->offCY), (Y + params->offCY), delta); #endif // Creating delta array for PARAM array cl_double deltaArr[5]; for(int i=0; i<5; i++) { deltaArr[i] = deltaForType * returnMax(back_PARAM[i + (params->offc)]); } compareMatrices(clblasColumnMajor, 5 , 1, (back_PARAM + params->offc), (PARAM + params->offc), 5, deltaArr); deleteBuffers(D1, D2, X, Y, PARAM); deleteBuffers(back_D1, back_D2, back_X, back_Y, back_PARAM); delete[] events; } // Instantiate the test TEST_P(ROTMG, srotmg) { TestParams params; getParams(¶ms); rotmgCorrectnessTest(¶ms); } TEST_P(ROTMG, drotmg) { TestParams params; getParams(¶ms); rotmgCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-sbmv.cpp000066400000000000000000000154301264277366700216240ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
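// A minimal sketch of the column-major banded layout SBMV reads: for the
// upper triangle with bandwidth K, element (i, j) with j-K <= i <= j is
// stored at A[(K + i - j) + j*lda], lda >= K+1. The naive loop below forms
// y = alpha*A*x + beta*y through that layout. Illustrative only; 0-based
// indices and unit increments are assumed.
#include <cstddef>

static void refSbmvUpper(size_t n, size_t k, float alpha,
                         const float *a, size_t lda,
                         const float *x, float beta, float *y)
{
    for (size_t i = 0; i < n; i++) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            size_t lo = (i < j) ? i : j;         // min(i, j)
            size_t hi = (i < j) ? j : i;         // max(i, j)
            if (hi - lo > k) {
                continue;                        // outside the band
            }
            // by symmetry both A(i,j) and A(j,i) map to the stored A(lo,hi)
            sum += a[(k + lo - hi) + hi * lda] * x[j];
        }
        y[i] = alpha * sum + beta * y[i];
    }
}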
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas GBMV call results } } template void sbmvCorrectnessTest(TestParams *params) { cl_int err; T *A, *X, *blasY, *clblasY; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; size_t lengthX, lengthY, lengthA; base = clMath::BlasBase::getInstance(); if (((typeid(T) == typeid(cl_double))) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); lengthA = params->N * params->lda; lengthX = (params->N - 1)*abs(params->incx) + 1; lengthY = (params->N - 1)*abs(params->incy) + 1; A = new T[ lengthA + params->offA ]; X = new T[ lengthX + params->offBX ]; blasY = new T[ lengthY + params->offCY ]; clblasY = new T[ lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(A, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); randomGbmvMatrices(params->order, clblasNoTrans, params->N, params->N, &alpha, &beta, (A + params->offA), params->lda, (X+params->offBX), params->incx, (blasY+params->offCY), params->incy ); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xSBMV routine... "; clblasOrder fOrder; clblasUplo fUplo; fOrder = params->order; fUplo = params->uplo; size_t fN = params->N, fK = params->K; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower; fN = params->N; } clMath::blas::sbmv(fOrder, fUplo, fN, fK, alpha, A, params->offA, params->lda, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." 
<< ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSBMV routine... "; err = (cl_int)clMath::clblas::sbmv(params->order, params->uplo, params->N, params->K, alpha, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SBMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(A, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SBMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(A, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(SBMV, ssbmv) { TestParams params; getParams(¶ms); sbmvCorrectnessTest(¶ms); } TEST_P(SBMV, dsbmv) { TestParams params; getParams(¶ms); sbmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-scal.cpp000066400000000000000000000152771264277366700216100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
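// A minimal sketch of what xSCAL computes, including the csscal/zdscal
// variant that the tests in this file select with params->K: there a purely
// real alpha scales both components of every complex element. Illustrative
// only; assumes a positive increment.
#include <cstddef>
#include <complex>

static void refScal(size_t n, float alpha, float *x, int incx)
{
    for (size_t i = 0; i < n; i++) {
        x[i * incx] *= alpha;
    }
}

static void refCsscal(size_t n, float alpha, std::complex<float> *x, int incx)
{
    for (size_t i = 0; i < n; i++) {
        x[i * incx] *= alpha;                    // scales real and imaginary parts
    }
}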
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX) { if(objX != NULL) { clReleaseMemObject(objX); } } template static void deleteBuffers(T *blasX, T *clblasX) { if(blasX != NULL) { delete[] blasX; } if(clblasX != NULL) { delete[] clblasX; } } template void scalCorrectnessTest(TestParams *params) { cl_int err; T *blasX, *clblasX; cl_mem bufX; clMath::BlasBase *base; cl_event *events; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); bool is_css_zds = (params->K == 1)? true: false; // K indicates csscal/zdscal blasX = new T[lengthX + params->offBX ]; clblasX = new T[lengthX + params->offBX ]; if( (blasX == NULL) || (clblasX == NULL) ) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasX, clblasX); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; randomVectors(params->N, (blasX+params->offBX), params->incx); alpha = convertMultiplier(params->alpha); memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); ::std::cerr << "Done" << ::std::endl; bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xSCAL routine... "; // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal // This is to make sure both get the same scalar alpha. check wrapper functions ::clMath::blas::scal(is_css_zds, params->N, alpha, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if (bufX == NULL) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufX); deleteBuffers(blasX, clblasX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSCAL routine... "; // Both blas and clBlas wrapper functions consider the real part of alpha in case of css/zdscal // This is to make sure both get the same scalar alpha. 
check wrapper functions err = (cl_int)::clMath::clblas::scal(is_css_zds, params->N, alpha, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX); deleteBuffers(blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SCAL() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX); deleteBuffers(blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SCAL: Reading results failed...." << std::endl; } releaseMemObjects(bufX); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); deleteBuffers(blasX, clblasX); delete[] events; } // Instantiate the test TEST_P(SCAL, sscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } TEST_P(SCAL, dscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } TEST_P(SCAL, cscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } TEST_P(SCAL, zscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } // For these 2 routines alpha is scalar TEST_P(SCAL, csscal) { TestParams params; getParams(¶ms); params.K = 1; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } TEST_P(SCAL, zdscal) { TestParams params; getParams(¶ms); params.K = 1; // K will indicate wheather routine is csscal/zdscal scalCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-spmv.cpp000066400000000000000000000150651264277366700216460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
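// A minimal sketch of the packed symmetric storage SPMV operates on: for the
// upper triangle in column-major order, element (i, j) with i <= j lives at
// AP[i + j*(j+1)/2], and y = alpha*A*x + beta*y exploits A(j,i) = A(i,j).
// Illustrative only; 0-based indices and unit increments are assumed.
#include <cstddef>

static void refSpmvUpper(size_t n, float alpha, const float *ap,
                         const float *x, float beta, float *y)
{
    for (size_t i = 0; i < n; i++) {
        float sum = 0.0f;
        for (size_t j = 0; j < n; j++) {
            size_t lo = (i < j) ? i : j;
            size_t hi = (i < j) ? j : i;
            sum += ap[lo + hi * (hi + 1) / 2] * x[j];   // packed upper element
        }
        y[i] = alpha * sum + beta * y[i];
    }
}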
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { if(A != NULL) { delete[] A; } if(X != NULL) { delete[] X; } if(blasY != NULL) { delete[] blasY; } if(clblasY != NULL) { delete[] clblasY; // To hold clblas SPMV call results } } template void spmvCorrectnessTest(TestParams *params) { cl_int err; T *AP, *X, *blasY, *clblasY; cl_mem bufAP, bufX, bufY; clMath::BlasBase *base; cl_event *events; T alpha, beta; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = (params->N * (params->N + 1)) / 2; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); AP = new T[lengthA + params->offA ]; X = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; clblasY = new T[lengthY + params->offCY ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((AP == NULL) || (X == NULL) || (blasY == NULL) || (clblasY == NULL)) { deleteBuffers(AP, X, blasY, clblasY); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); beta = convertMultiplier(params->beta); randomSpmvMatrices(params->order, params->uplo, params->N, true, &alpha, (AP + params->offA), (X + params->offBX), params->incx, true, &beta, (blasY + params->offCY), params->incy); // Copy blasY to clblasY memcpy(clblasY, blasY, (lengthY + params->offCY)* sizeof(*blasY)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthA + params->offA)* sizeof(*AP), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(clblasY, (lengthY + params->offCY) * sizeof(*clblasY), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xSPMV routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; } ::clMath::blas::spmv( order, fUplo, params->N, alpha, AP, params->offA, X, params->offBX, params->incx, beta, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." 
<< ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSPMV routine... "; err = (cl_int)::clMath::clblas::spmv(params->order, params->uplo, params->N, alpha, bufAP, params->offA, bufX, params->offBX, params->incx, beta, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(*clblasY), clblasY, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SPMV: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX, bufY); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (clblasY + params->offCY), lengthY); deleteBuffers(AP, X, blasY, clblasY); delete[] events; } // Instantiate the test TEST_P(SPMV, sspmv) { TestParams params; getParams(¶ms); spmvCorrectnessTest(¶ms); } TEST_P(SPMV, dspmv) { TestParams params; getParams(¶ms); spmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-spr.cpp000066400000000000000000000146261264277366700214670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
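// A minimal sketch of the packed symmetric rank-1 update performed by SPR:
// AP(i, j) += alpha * x[i] * x[j] over the stored (upper) triangle, with the
// same AP[i + j*(j+1)/2] packed indexing used by SPMV. Illustrative only;
// 0-based indices and a unit increment are assumed.
#include <cstddef>

static void refSprUpper(size_t n, float alpha, const float *x, float *ap)
{
    for (size_t j = 0; j < n; j++) {
        for (size_t i = 0; i <= j; i++) {
            ap[i + j * (j + 1) / 2] += alpha * x[i] * x[j];
        }
    }
}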
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objAP, cl_mem objX) { if(objAP != NULL) { clReleaseMemObject(objAP); } if(objX != NULL) { clReleaseMemObject(objX); } } template static void deleteBuffers(T *blasAP, T *clblasAP, T *X) { if(blasAP != NULL) { delete[] blasAP; } if(clblasAP != NULL) { delete[] clblasAP; } if(X != NULL) { delete[] X; } } template void sprCorrectnessTest(TestParams *params) { cl_int err; T *blasAP, *clblasAP, *X; // T *tempA; cl_mem bufAP, bufX; clMath::BlasBase *base; cl_event *events; bool useAlpha; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthAP = ( ( params->N*( params->N + 1 ) )/2 ); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasAP = new T[lengthAP + params->offa]; clblasAP = new T[lengthAP + params->offa]; X = new T[lengthX + params->offBX]; // tempA = new T[lengthA + params->offa ]; srand(params->seed); ::std::cerr << "Generating input data... "; memset(blasAP, -1, (lengthAP + params->offa)); memset(clblasAP, -1, (lengthAP + params->offa)); memset(X, -1, (lengthX + params->offBX)); alpha = convertMultiplier(params->alpha); useAlpha = true; #ifdef DEBUG_SPR printf("ALPHA in CORR_SPR.CPP %f\n", alpha); #endif if((blasAP == NULL) || (X == NULL) || (clblasAP == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasAP, clblasAP, X); delete[] events; SUCCEED(); return; } randomSyrMatrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasAP + params->offa), 0, (X + params->offBX), params->incx); memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); ::std::cerr << "Done" << ::std::endl; bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa) * sizeof(*clblasAP), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xSPR routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; if( params->transA == clblasConjTrans ) doConjugate( (blasAP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 ); } clMath::blas::spr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasAP, params->offa); ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufAP, bufX); deleteBuffers(blasAP, clblasAP, X); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSPR routine... "; err = (cl_int)::clMath::clblas::spr( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX); deleteBuffers(blasAP, clblasAP, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX); deleteBuffers(blasAP, clblasAP, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SPR: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX); printf("Comparing the results\n"); compareMatrices(clblasColumnMajor, lengthAP , 1, (blasAP + params->offa), (clblasAP + params->offa), lengthAP); deleteBuffers(blasAP, clblasAP, X); delete[] events; } // Instantiate the test TEST_P(SPR, sspr) { TestParams params; getParams(¶ms); sprCorrectnessTest(¶ms); } TEST_P(SPR, dspr) { TestParams params; getParams(¶ms); sprCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-spr2.cpp000066400000000000000000000150371264277366700215460ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
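// A minimal sketch of the packed symmetric rank-2 update performed by SPR2:
// AP(i, j) += alpha*(x[i]*y[j] + y[i]*x[j]) over the stored (upper) triangle
// in the same packed layout. Illustrative only; 0-based indices and unit
// increments are assumed.
#include <cstddef>

static void refSpr2Upper(size_t n, float alpha,
                         const float *x, const float *y, float *ap)
{
    for (size_t j = 0; j < n; j++) {
        for (size_t i = 0; i <= j; i++) {
            ap[i + j * (j + 1) / 2] += alpha * (x[i] * y[j] + y[i] * x[j]);
        }
    }
}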
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *blasA, T *clblasA, T *X, T *Y) { if(blasA != NULL) { delete[] blasA; } if(clblasA != NULL) { delete[] clblasA; } if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } } template void spr2CorrectnessTest(TestParams *params) { cl_int err; T *blasAP, *clblasAP, *X, *Y; cl_mem bufAP, bufX, bufY; clMath::BlasBase *base; cl_event *events; bool useAlpha; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthAP = (params->N *( params->N + 1 ))/2 ; size_t lengthX = (1 + ((params->N - 1) * abs(params->incx))); size_t lengthY = (1 + ((params->N - 1) * abs(params->incy))); blasAP = new T[lengthAP + params->offa ]; clblasAP = new T[lengthAP + params->offa ]; X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; srand(params->seed); if((blasAP == NULL) || (clblasAP == NULL) || (X == NULL) || (Y == NULL)) { deleteBuffers(blasAP, clblasAP, X, Y); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); useAlpha = true; ::std::cerr << "Generating input data... "; randomSyr2Matrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasAP + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasAP to clblasAP memcpy(clblasAP, blasAP, (lengthAP + params->offa)* sizeof(*blasAP)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(clblasAP, (lengthAP + params->offa)* sizeof(*clblasAP), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xSPR2 routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; } ::clMath::blas::spr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasAP, params->offa); ::std::cerr << "Done" << ::std::endl; if ((bufAP == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSPR2 routine... "; err = (cl_int)::clMath::clblas::spr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufAP, params->offa, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SPR2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufY); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufAP, CL_TRUE, 0, (lengthAP + params->offa) * sizeof(*clblasAP), clblasAP, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SPR2: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX, bufY); compareMatrices(clblasColumnMajor, lengthAP, 1, (blasAP + params->offa), (clblasAP + params->offa), lengthAP); deleteBuffers(blasAP, clblasAP, X, Y); delete[] events; } // Instantiate the test TEST_P(SPR2, sspr2) { TestParams params; getParams(¶ms); spr2CorrectnessTest(¶ms); } TEST_P(SPR2, dspr2) { TestParams params; getParams(¶ms); spr2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-swap.cpp000066400000000000000000000146351264277366700216350ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
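// A rough reference for what BLAS xSWAP does, which the test that follows checks against
// clBLAS: exchange n elements of two strided vectors, starting from the far end when an
// increment is negative. This helper is illustrative only; the harness actually calls
// ::clMath::blas::swap for the host-side result.
#include <cstddef>

template <typename T>
static void referenceSwap(std::size_t n, T *x, std::ptrdiff_t incx,
                          T *y, std::ptrdiff_t incy)
{
    std::ptrdiff_t ix = (incx >= 0) ? 0 : static_cast<std::ptrdiff_t>(n - 1) * (-incx);
    std::ptrdiff_t iy = (incy >= 0) ? 0 : static_cast<std::ptrdiff_t>(n - 1) * (-incy);
    for (std::size_t i = 0; i < n; ++i, ix += incx, iy += incy) {
        T tmp = x[ix];                              // plain element-wise exchange
        x[ix] = y[iy];
        y[iy] = tmp;
    }
}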
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objX, cl_mem objY) { if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *X, T *Y, T *blasX, T *blasY) { if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } if(blasX != NULL) { delete[] blasX; } if(blasY != NULL) { delete[] blasY; } } template void swapCorrectnessTest(TestParams *params) { cl_int err; T *X, *Y, *blasX, *blasY; cl_mem bufX, bufY; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); size_t lengthY = (1 + ((params->N -1) * abs(params->incy))); X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; blasX = new T[lengthX + params->offBX ]; blasY = new T[lengthY + params->offCY ]; if((X == NULL) || (blasX == NULL) || (Y == NULL) || (blasY == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(X, Y, blasX, blasY); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Populate A and blasX randomVectors(params->N, (X+params->offBX), params->incx, (Y+params->offCY), params->incy); memcpy(blasX, X, (lengthX + params->offBX) * sizeof(T)); memcpy(blasY, Y, (lengthY + params->offCY) * sizeof(T)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_READ_WRITE); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(T), 0, CL_MEM_READ_WRITE); if ((bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling reference xSWAP routine... "; ::clMath::blas::swap( params->N, blasX, params->offBX, params->incx, blasY, params->offCY, params->incy); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling clblas xSWAP routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : (( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: (( typeid(T) == typeid(cl_float2))? 
TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE)); err = (cl_int)::clMath::clblas::swap( type, params->N, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SWAP() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufY); deleteBuffers(X, Y, blasX, blasY); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(T), X, 0, NULL, NULL); err |= clEnqueueReadBuffer(base->commandQueues()[0], bufY, CL_TRUE, 0, (lengthY + params->offCY) * sizeof(T), Y, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SWAP: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufY); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (X + params->offBX), lengthX); compareMatrices(clblasColumnMajor, lengthY , 1, (blasY + params->offCY), (Y + params->offCY), lengthY); deleteBuffers(X, Y, blasX, blasY); delete[] events; } // Instantiate the test TEST_P(SWAPXY, sswap) { TestParams params; getParams(¶ms); swapCorrectnessTest(¶ms); } TEST_P(SWAPXY, dswap) { TestParams params; getParams(¶ms); swapCorrectnessTest(¶ms); } TEST_P(SWAPXY, cswap) { TestParams params; getParams(¶ms); swapCorrectnessTest(¶ms); } TEST_P(SWAPXY, zswap) { TestParams params; getParams(¶ms); swapCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-symm.cpp000066400000000000000000000201161264277366700216370ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
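// The SYMM test that follows evaluates row-major cases through the column-major reference
// by transposing the problem: M and N are exchanged and side/uplo are flipped, exactly as
// done inside the test. Sketch with an illustrative struct (not the harness's TestParams):
#include <cstddef>
#include <utility>   // std::swap
#include <clBLAS.h>  // clblasOrder / clblasSide / clblasUplo

struct SymmShapeSketch {
    clblasOrder order;
    clblasSide  side;
    clblasUplo  uplo;
    size_t      M, N;
};

static void toColumnMajorEquivalent(SymmShapeSketch &p)
{
    if (p.order != clblasColumnMajor) {
        p.order = clblasColumnMajor;
        std::swap(p.M, p.N);
        p.side = (p.side == clblasLeft)  ? clblasRight : clblasLeft;
        p.uplo = (p.uplo == clblasUpper) ? clblasLower : clblasUpper;
    }
}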
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC) { if(objA != NULL) { clReleaseMemObject(objA); } if(objB != NULL) { clReleaseMemObject(objB); } if(objC != NULL) { clReleaseMemObject(objC); } } template static void deleteBuffers(T *A, T *B, T *C, T *backC) { if(A != NULL) { delete[] A; } if(B != NULL) { delete[] B; } if(C != NULL) { delete[] C; } if(backC != NULL) { delete[] backC; } } template void symmCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *C, *backC; T alpha_, beta_; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; cl_event *events; size_t ka, kbc; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; if (events == NULL) { } memset(events, 0, params->numCommandQueues * sizeof(cl_event)); if( params->side == clblasLeft ) ka = params->M; else ka = params->N; if( params->order == clblasColumnMajor ) kbc = params->N; else kbc = params->M; size_t lengthA = ka * params->lda; size_t lengthB = kbc * params->ldb; size_t lengthC = kbc * params->ldc; alpha_ = convertMultiplier(params->alpha); beta_ = convertMultiplier(params->beta); A = new T[ lengthA + params->offa ]; B = new T[ lengthB + params->offb ]; C = new T[ lengthC + params->offc ]; backC = new T[ lengthC + params->offc ]; if((A == NULL) || (B == NULL) || (C == NULL) || (backC == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, B, C, backC); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; int creationFlags = 0, AcreationFlags; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); AcreationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYMM; #ifdef __TEST_CSYMM_ACML_NANBUG__ // // NOTE: Whether this clearing to zero is present or not // ACML returns "nan" for few csymm cases. This is here // to make things easier and rule of out-of-bound inputs // memset(A, 0, (lengthA + params->offa)*sizeof(T)); memset(B, 0, (lengthB + params->offb)*sizeof(T)); memset(C, 0, (lengthC + params->offc)*sizeof(T)); #else populate( A + params->offa , ka, ka, params-> lda, BlasFn, AcreationFlags); populate( B + params->offb , params-> M, params-> N, params-> ldb, BlasFn, creationFlags); populate( C + params->offc , params-> M, params-> N, params-> ldc, BlasFn, creationFlags); #endif // Copy C to backX memcpy(backC, C, (lengthC + params->offc) * sizeof(T)); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa) * sizeof(T), 0, CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, (lengthB + params->offb) * sizeof(T), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(backC, (lengthC + params->offc) * sizeof(T), 0, CL_MEM_READ_WRITE); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xSYMM routine... 
"; clblasOrder fOrder; clblasUplo fUplo; clblasSide fSide; size_t fN, fM; fOrder = params->order; fUplo = params->uplo; fSide = params->side; fM = params->M; fN = params->N; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fM = params->N; fN = params->M; fSide = (params->side == clblasLeft)? clblasRight: clblasLeft; fUplo = (params->uplo == clblasUpper)? clblasLower: clblasUpper; } // Call reference blas routine clMath::blas::symm(fOrder, fSide, fUplo, fM, fN, alpha_, A, params->offa, params->lda, B, params->offb, params->ldb, beta_, C, params->offc, params->ldc); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYMM routine... "; err = (cl_int)::clMath::clblas::symm( params->order, params->side, params->uplo, params->M, params->N, alpha_, bufA, params->offa, params->lda, bufB, params->offb, params->ldb, beta_, bufC, params->offc, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events ); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, C, backC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, (lengthC + params->offc) * sizeof(T), backC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); // handle lda correctly based on row-major/col-major.. compareMatrices(params->order, params->M , params->N, (C + params->offc), (backC + params->offc), params->ldc); deleteBuffers(A, B, C, backC); delete[] events; } // Instantiate the test #ifndef __TEST_CSYMM_ACML_NANBUG__ TEST_P(SYMM, ssymm) { TestParams params; getParams(¶ms); symmCorrectnessTest(¶ms); } TEST_P(SYMM, dsymm) { TestParams params; getParams(¶ms); symmCorrectnessTest(¶ms); } TEST_P(SYMM, csymm) { TestParams params; getParams(¶ms); symmCorrectnessTest(¶ms); } TEST_P(SYMM, zsymm) { TestParams params; getParams(¶ms); symmCorrectnessTest(¶ms); } #else TEST_P(SYMM, csymm) { TestParams params; getParams(¶ms); symmCorrectnessTest(¶ms); } #endif clblas-2.10/src/tests/correctness/corr-symv.cpp000066400000000000000000000171411264277366700216540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { clReleaseMemObject(objA); clReleaseMemObject(objX); clReleaseMemObject(objY); } template static void deleteBuffers(T *A, T *X, T *blasY, T *clblasY) { delete[] A; delete[] X; delete[] blasY; delete[] clblasY; } template void symvCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC, *X, *Y; T alpha, beta; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha, useBeta; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO(); beta = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; // X and Y are rows or columns in matrixes B and C B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; X = &B[params->offBX]; Y = &blasC[params->offCY]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } ::std::cerr << "Generating input data... "; setNans(params->rowsA * params->columnsA, A); setNans(params->rowsB * params->columnsB, B); setNans(params->rowsC * params->columnsC, blasC); randomGemmMatrices(params->order, clblasNoTrans, clblasNoTrans, params->N, params->N, params->N, useAlpha, &alpha, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); // set to NAN elements which must not be accessed // in matrix A setTriangleNans(params->order, params->uplo, params->N, A, params->lda); // in matrix B containing vector X setVectorNans(params->offBX, abs(params->incx), B, params->N, params->columnsB * params->rowsB); // in matrix C containing vector Y setVectorNans(params->offCY, abs(params->incy), blasC, params->N, params->columnsC * params->rowsC); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*clblasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xSYMV routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::symv(clblasColumnMajor, params->uplo, params->N, alpha, A, params->lda, X, params->incx, beta, Y, params->incy); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); ::clMath::blas::symv(clblasColumnMajor, params->uplo, params->N, alpha, reorderedA, params->rowsA, X, params->incx, beta, Y, params->incy); delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*X), 0, CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), 0, CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYMV routine... "; err = (cl_int)::clMath::clblas::symv(params->order, params->uplo, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->incx, beta, bufC, params->offCY, params->incy, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, 0, params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareVectors(params->offCY, params->N, abs(params->incy), params->columnsC * params->rowsC, blasC, clblasC); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(SYMV, ssymv) { TestParams params; getParams(¶ms); symvCorrectnessTest(¶ms); } TEST_P(SYMV, dsymv) { TestParams params; getParams(¶ms); symvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-syr.cpp000066400000000000000000000202121264277366700214640ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } } template static void deleteBuffers(T *blasA, T *clblasA, T *X) { if(blasA != NULL) { delete[] blasA; } if(clblasA != NULL) { delete[] clblasA; } if(X != NULL) { delete[] X; } } template void syrCorrectnessTest(TestParams *params) { cl_int err; T *blasA, *clblasA, *X; // T *tempA; cl_mem bufA, bufX; clMath::BlasBase *base; cl_event *events; bool useAlpha; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasA = new T[lengthA + params->offa ]; clblasA = new T[lengthA + params->offa ]; X = new T[lengthX + params->offBX ]; // tempA = new T[lengthA + params->offa ]; srand(params->seed); ::std::cerr << "Generating input data... "; memset(blasA, -1, (lengthA + params->offa)); memset(clblasA, -1, (lengthA + params->offa)); memset(X, -1, (lengthX + params->offBX)); alpha = convertMultiplier(params->alpha); useAlpha = true; #ifdef DEBUG_SYR printf("ALPHA in CORR_SYR.CPP %f\n", alpha); #endif if((blasA == NULL) || (X == NULL) || (clblasA == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(blasA, clblasA, X); delete[] events; SUCCEED(); return; } randomSyrMatrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasA + params->offa), params->lda, (X + params->offBX), params->incx); /* // Set data in A and X using populate() routine int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYR; // Populate A and blasX populate( blasA + params->offa, params-> N, params-> N, params-> lda, BlasFn, creationFlags); populate( X , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn); */ // Copy blasA to clblasA memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); // memcpy(tempA, blasA, (lengthA + params->offa)* sizeof(*blasA)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa) * sizeof(*clblasA), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xSYR routine... 
"; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; //printf("\n\n before acml call\nA\n"); // printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, blasA); //printf("\nX\n"); //printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, X); if (order == clblasColumnMajor) { ::clMath::blas::syr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasA, params->offa, params->lda); } else { T *reorderedA = new T[lengthA + params->offa]; //reorderMatrix(clblasRowMajor, params->N, params->lda, blasA, reorderedA); fUplo = (fUplo == clblasUpper) ? clblasLower : clblasUpper; //::clMath::blas::syr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, reorderedA, params->offa, params->lda); ::clMath::blas::syr( clblasColumnMajor, fUplo, params->N, alpha, X, params->offBX, params->incx, blasA, params->offa, params->lda); //reorderMatrix(clblasColumnMajor, params->lda, params->N, reorderedA, blasA); delete[] reorderedA; } //printf("After acml\n"); //printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, blasA); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) ) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX); deleteBuffers(blasA, clblasA, X); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYR routine... "; err = (cl_int)::clMath::clblas::syr( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(blasA, clblasA, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(blasA, clblasA, X); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SYR: Reading results failed...." 
<< std::endl; } releaseMemObjects(bufA, bufX); //printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, clblasA); //getchar(); // printf("Comparing with the temp buffer\n"); // compareMatrices(clblasColumnMajor, 1, (params->lda - params->N), (blasA + params->offa + params->N), (tempA + params->offa + params->N), // params->lda); // delete[] tempA; printf("Comparing the results\n"); compareMatrices(params->order, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); deleteBuffers(blasA, clblasA, X); delete[] events; } // Instantiate the test TEST_P(SYR, ssyr) { TestParams params; getParams(¶ms); syrCorrectnessTest(¶ms); } TEST_P(SYR, dsyr) { TestParams params; getParams(¶ms); syrCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-syr2.cpp000066400000000000000000000147501264277366700215600ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objY) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objY != NULL) { clReleaseMemObject(objY); } } template static void deleteBuffers(T *blasA, T *clblasA, T *X, T *Y) { if(blasA != NULL) { delete[] blasA; } if(clblasA != NULL) { delete[] clblasA; } if(X != NULL) { delete[] X; } if(Y != NULL) { delete[] Y; } } template void syr2CorrectnessTest(TestParams *params) { cl_int err; T *blasA, *clblasA, *X, *Y; cl_mem bufA, bufX, bufY; clMath::BlasBase *base; cl_event *events; bool useAlpha; T alpha; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N - 1) * abs(params->incx))); size_t lengthY = (1 + ((params->N - 1) * abs(params->incy))); blasA = new T[lengthA + params->offa ]; clblasA = new T[lengthA + params->offa ]; X = new T[lengthX + params->offBX ]; Y = new T[lengthY + params->offCY ]; srand(params->seed); if((blasA == NULL) || (clblasA == NULL) || (X == NULL) || (Y == NULL)) { deleteBuffers(blasA, clblasA, X, Y); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" 
<< ::std::endl; delete[] events; SUCCEED(); return; } alpha = convertMultiplier(params->alpha); useAlpha = true; ::std::cerr << "Generating input data... "; randomSyr2Matrices(params->order, params->uplo, params->N, useAlpha, &alpha, (blasA + params->offa), params->lda, (X + params->offBX), params->incx, (Y + params->offCY), params->incy); // Copy blasA to clblasA memcpy(clblasA, blasA, (lengthA + params->offa)* sizeof(*blasA)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(clblasA, (lengthA + params->offa)* sizeof(*clblasA), 0,CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(X, (lengthX + params->offBX)* sizeof(*X), 0, CL_MEM_READ_ONLY); bufY = base->createEnqueueBuffer(Y, (lengthY + params->offCY)* sizeof(*Y), 0, CL_MEM_READ_ONLY); ::std::cerr << "Calling reference xSYR2 routine... "; clblasOrder order; clblasUplo fUplo; order = params->order; fUplo = params->uplo; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; } ::clMath::blas::syr2( order, fUplo, params->N, alpha, X, params->offBX, params->incx, Y, params->offCY, params->incy, blasA, params->offa, params->lda); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL) || (bufY == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYR2 routine... "; err = (cl_int)::clMath::clblas::syr2( params->order, params->uplo, params->N, alpha, bufX, params->offBX, params->incx, bufY, params->offCY, params->incy, bufA, params->offa, params->lda, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufY); deleteBuffers(blasA, clblasA, X, Y); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufA, CL_TRUE, 0, (lengthA + params->offa) * sizeof(*clblasA), clblasA, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "SYR2: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufY); compareMatrices(clblasColumnMajor, params->N , params->N, (blasA + params->offa), (clblasA + params->offa), params->lda); deleteBuffers(blasA, clblasA, X, Y); delete[] events; } // Instantiate the test TEST_P(SYR2, ssyr2) { TestParams params; getParams(¶ms); syr2CorrectnessTest(¶ms); } TEST_P(SYR2, dsyr2) { TestParams params; getParams(¶ms); syr2CorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-syr2k.cpp000066400000000000000000000215701264277366700217310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
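// What the SYR2K tests below verify, written out for the column-major, non-transposed case
// (standard BLAS definition): only the triangle selected by uplo is updated as
//   C(i,j) = alpha * sum_k( A(i,k)*B(j,k) + B(i,k)*A(j,k) ) + beta * C(i,j).
// A naive scalar sketch for a single element (not the clBLAS kernel):
#include <cstddef>

static double syr2kElementRef(const double *A, std::size_t lda,
                              const double *B, std::size_t ldb,
                              const double *C, std::size_t ldc,
                              std::size_t i, std::size_t j, std::size_t K,
                              double alpha, double beta)
{
    double acc = 0.0;
    for (std::size_t k = 0; k < K; ++k) {
        acc += A[i + k * lda] * B[j + k * ldb] + B[i + k * ldb] * A[j + k * lda];
    }
    return alpha * acc + beta * C[i + j * ldc];
}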
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem objA, cl_mem objB, cl_mem objC) { clReleaseMemObject(objA); clReleaseMemObject(objB); clReleaseMemObject(objC); } template static void deleteBuffers(T *A, T *B, T *blasC, T *clblasC) { delete[] A; delete[] B; delete[] blasC; delete[] clblasC; } template void syr2kCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasC, *clblasC; T alpha, beta, a; cl_mem bufA, bufB, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; clblasTranspose transB; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } if ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))) { if (params->transA == clblasConjTrans) { ::std::cerr << ">> syr2k(CONJUGATE_TRANSPOSE) for complex numbers " "is not allowed." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO(); beta = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } ::std::cerr << "Generating input data... "; if (!useAlpha) { alpha = random(100); if (module(alpha) == 0.0) { alpha = ONE(); } } a = alpha * 2; transB = (params->transA == clblasNoTrans) ? clblasTrans : clblasNoTrans; randomGemmMatrices(params->order, params->transA, transB, params->N, params->N, params->K, true, &a, A, params->lda, B, params->ldb, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xSYR2K routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::syr2k(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, A, params->lda, B, params->ldb, beta, blasC, params->ldc); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; T *reorderedC = new T[params->rowsC * params->columnsC]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsB, params->columnsB, B, reorderedB); reorderMatrix(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::syr2k(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB, beta, reorderedC, params->rowsC); reorderMatrix(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(B, params->rowsB * params->columnsB * sizeof(*B), params->offBX * sizeof(*B), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYR2K routine... 
"; err = (cl_int)::clMath::clblas::syr2k(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, beta, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYR2K() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB, bufC); deleteBuffers(A, B, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufB, bufC); compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, B, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(SYR2K, ssyr2k) { TestParams params; getParams(¶ms); syr2kCorrectnessTest(¶ms); } TEST_P(SYR2K, dsyr2k) { TestParams params; getParams(¶ms); syr2kCorrectnessTest(¶ms); } TEST_P(SYR2K, csyr2k) { TestParams params; getParams(¶ms); syr2kCorrectnessTest(¶ms); } TEST_P(SYR2K, zsyr2k) { TestParams params; getParams(¶ms); syr2kCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-syrk.cpp000066400000000000000000000177441264277366700216570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem objA, cl_mem objC) { clReleaseMemObject(objA); clReleaseMemObject(objC); } template static void deleteBuffers(T *A, T *blasC, T *clblasC) { delete[] A; delete[] blasC; delete[] clblasC; } template void syrkCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasC, *clblasC; T alpha, beta; cl_mem bufA, bufC; clMath::BlasBase *base; bool useAlpha; bool useBeta; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } if ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))) { if (params->transA == clblasConjTrans) { ::std::cerr << ">> syrk(CONJUGATE_TRANSPOSE) for complex numbers " "is not allowed." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); useBeta = base->useBeta(); alpha = ZERO(); beta = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; blasC = new T[params->rowsC * params->columnsC]; clblasC = new T[params->rowsC * params->columnsC]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } if (useBeta) { beta = convertMultiplier(params->beta); } ::std::cerr << "Generating input data... "; if (!useAlpha) { alpha = random(100); if (module(alpha) == 0.0) { alpha = ONE(); } } randomGemmMatrices(params->order, params->transA, clblasNoTrans, params->N, params->N, params->K, useAlpha, &alpha, A, params->lda, NULL, 0, useBeta, &beta, blasC, params->ldc); memcpy(clblasC, blasC, params->rowsC * params->columnsC * sizeof(*blasC)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xSYRK routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::syrk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, A, params->lda, beta, blasC, params->ldc); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedC = new T[params->rowsC * params->columnsC]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsC, params->columnsC, blasC, reorderedC); ::clMath::blas::syrk(clblasColumnMajor, params->uplo, params->transA, params->N, params->K, alpha, reorderedA, params->rowsA, beta, reorderedC, params->rowsC); reorderMatrix(clblasColumnMajor, params->rowsC, params->columnsC, reorderedC, blasC); delete[] reorderedC; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufC = base->createEnqueueBuffer(clblasC, params->rowsC * params->columnsC * sizeof(*clblasC), params->offCY * sizeof(*clblasC), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufC == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xSYRK routine... 
"; err = (cl_int)::clMath::clblas::syrk(params->order, params->uplo, params->transA, params->N, params->K, alpha, bufA, params->offA, params->lda, beta, bufC, params->offCY, params->ldc, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::SYRK() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufC); deleteBuffers(A, blasC, clblasC); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufC, CL_TRUE, params->offCY * sizeof(*clblasC), params->rowsC * params->columnsC * sizeof(*clblasC), clblasC, 0, NULL, NULL); releaseMemObjects(bufA, bufC); compareMatrices(params->order, params->N, params->N, blasC, clblasC, params->ldc); deleteBuffers(A, blasC, clblasC); delete[] events; } // Instantiate the test TEST_P(SYRK, ssyrk) { TestParams params; getParams(¶ms); syrkCorrectnessTest(¶ms); } TEST_P(SYRK, dsyrk) { TestParams params; getParams(¶ms); syrkCorrectnessTest(¶ms); } TEST_P(SYRK, csyrk) { TestParams params; getParams(¶ms); syrkCorrectnessTest(¶ms); } TEST_P(SYRK, zsyrk) { TestParams params; getParams(¶ms); syrkCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-tbmv.cpp000066400000000000000000000161271264277366700216310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objXtemp) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objXtemp != NULL) { clReleaseMemObject(objXtemp); } } template static void deleteBuffers(T *A, T *blasX, T *clblasX) { if(A != NULL) { delete[] A; } if(blasX != NULL) { delete[] blasX; } if(clblasX != NULL) { delete[] clblasX; // To hold clblas TBMV call results } } template void tbmvCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasX, *clblasX; cl_mem bufA, bufX, bufXtemp; clMath::BlasBase *base; cl_event *events; size_t lengthX, lengthA; base = clMath::BlasBase::getInstance(); if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); lengthA = params->N * params->lda ; lengthX = (params->N - 1)*abs(params->incx) + 1; A = new T[ lengthA + params->offA ]; blasX = new T[ lengthX + params->offBX ]; clblasX = new T[ lengthX + params->offBX ]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (blasX == NULL) || (clblasX == NULL)) { deleteBuffers(A, blasX, clblasX); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } randomTbmvMatrices( params->N, (A + params->offA), params->lda, (blasX + params->offBX), params->incx ); // Copy blasY to clblasY memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufXtemp = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xTBMV routine... "; clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; fOrder = params->order; fTrans = params->transA; fUplo = params->uplo; size_t fN = params->N, fK = params->K; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans; fUplo = (params->uplo == clblasLower)? clblasUpper : clblasLower; if( params->transA == clblasConjTrans ) doConjugate( (A + params->offA), 1, lengthA, params->lda ); } clMath::blas::tbmv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL)|| (bufXtemp == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufA, bufX, bufXtemp ); deleteBuffers(A, blasX, clblasX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." 
<< ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTBMV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; err = (cl_int)clMath::clblas::tbmv(type, params->order, params->uplo, params->transA, params->diag, params->N, params->K, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, bufXtemp, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufXtemp); deleteBuffers(A, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TBMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufXtemp); deleteBuffers(A, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "TBMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufXtemp); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); deleteBuffers(A, blasX, clblasX); delete[] events; } // Instantiate the test TEST_P(TBMV, stbmv) { TestParams params; getParams(¶ms); tbmvCorrectnessTest(¶ms); } TEST_P(TBMV, dtbmv) { TestParams params; getParams(¶ms); tbmvCorrectnessTest(¶ms); } TEST_P(TBMV, ctbmv) { TestParams params; getParams(¶ms); tbmvCorrectnessTest(¶ms); } TEST_P(TBMV, ztbmv) { TestParams params; getParams(¶ms); tbmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-tbsv.cpp000066400000000000000000000170161264277366700216350ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "trsv-delta.h" static void releaseMemObjects(cl_mem objA, cl_mem objX) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } } template static void deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX) { if( A != NULL ) { delete[] A; } if( blasX != NULL ) { delete[] blasX; } if( backX != NULL ) { delete[] backX; } if( deltaX != NULL ) { delete[] deltaX; } } template void tbsvCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasX, *clblasX; cl_mem bufA, bufX; cl_double *deltaX; clMath::BlasBase *base; cl_event *events; size_t lengthX, lengthA; base = clMath::BlasBase::getInstance(); if (( (typeid(T) == typeid(DoubleComplex)) || (typeid(T) == typeid(cl_double)) ) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); lengthA = params->N * params->lda ; lengthX = (params->N - 1)*abs(params->incx) + 1; A = new T[ lengthA + params->offA ]; blasX = new T[ lengthX + params->offBX ]; clblasX = new T[ lengthX + params->offBX ]; deltaX = new cl_double[lengthX + params->offBX]; srand(params->seed); ::std::cerr << "Generating input data... "; if((A == NULL) || (blasX == NULL) || (clblasX == NULL)) { deleteBuffers(A, blasX, clblasX, deltaX); ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped!!!!!!!!!!!!" << ::std::endl; delete[] events; SUCCEED(); return; } memset( deltaX, 0, (lengthX + params->offBX)*sizeof(cl_double) ); memset( blasX, 0, (lengthX + params->offBX)*sizeof(T)); memset( clblasX, 0, (lengthX + params->offBX)*sizeof(T)); randomTbsvMatrices( params->order, params->uplo, params->diag, params->N, params->K, (A + params->offA), params->lda, (blasX + params->offBX), params->incx ); // Generate delta X for result comparison tbsvDelta( params->order, params->uplo, params->transA, params->diag, params->N, params->K, (A + params->offA), params->lda, (blasX + params->offBX), params->incx, (deltaX + params->offBX) ); memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offA)* sizeof(*A), 0, CL_MEM_READ_WRITE); bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); ::std::cerr << "Calling reference xTBSV routine... "; clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; fOrder = params->order; fTrans = params->transA; fUplo = params->uplo; size_t fN = params->N, fK = params->K; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params->transA == clblasNoTrans)? clblasTrans : clblasNoTrans; fUplo = (params->uplo == clblasLower)? 
clblasUpper : clblasLower; if( params->transA == clblasConjTrans ) doConjugate( (A + params->offA), params->N, params->lda, params->lda ); } clMath::blas::tbsv(fOrder, fUplo, fTrans, params->diag, fN, fK, A, params->offA, params->lda, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; if ((bufA == NULL) || (bufX == NULL)) { // Skip the test, the most probable reason is // matrix too big for a device. releaseMemObjects(bufA, bufX); deleteBuffers(A, blasX, clblasX, deltaX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTBSV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT:( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; err = (cl_int)clMath::clblas::tbsv(type, params->order, params->uplo, params->transA, params->diag, params->N, params->K, bufA, params->offA, params->lda, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(A, blasX, clblasX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TBSV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX); deleteBuffers(A, blasX, clblasX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "TBSV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX, (deltaX + params->offBX) ); deleteBuffers(A, blasX, clblasX, deltaX); delete[] events; } // Instantiate the test TEST_P(TBSV, stbsv) { TestParams params; getParams(¶ms); tbsvCorrectnessTest(¶ms); } TEST_P(TBSV, dtbsv) { TestParams params; getParams(¶ms); tbsvCorrectnessTest(¶ms); } TEST_P(TBSV, ctbsv) { TestParams params; getParams(¶ms); tbsvCorrectnessTest(¶ms); } TEST_P(TBSV, ztbsv) { TestParams params; getParams(¶ms); tbsvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-tpmv.cpp000066400000000000000000000171401264277366700216430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
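 *
 * Packed triangular matrix-vector product (xTPMV) correctness test: A and X
 * are filled by populate() with PACKED_MATRIX storage, the reference
 * clMath::blas::tpmv result is computed on the host, and the
 * clMath::clblas::tpmv output (which uses the scratch buffer bufXTemp) is
 * read back and compared with compareMatrices().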
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objAP, cl_mem objX, cl_mem objXTemp) { if(objAP != NULL) { clReleaseMemObject(objAP); } if(objX != NULL) { clReleaseMemObject(objX); } if(objXTemp != NULL) { clReleaseMemObject(objXTemp); } } template static void deleteBuffers(T *AP, T *blasX, T *clblasX) { if(AP != NULL) { delete[] AP; } if(blasX != NULL) { delete[] blasX; } if(clblasX != NULL) { delete[] clblasX; } } template void tpmvCorrectnessTest(TestParams *params) { cl_int err; T *AP, *blasX, *clblasX; cl_mem bufAP, bufX, bufXTemp; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthAP = (params->N *( params->N + 1 ))/2 ; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); AP = new T[lengthAP + params->offa ]; blasX = new T[lengthX + params->offBX ]; clblasX = new T[lengthX + params->offBX ]; if((AP == NULL) || (blasX == NULL) || (clblasX == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(AP, blasX, clblasX); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Set data in A and X using populate() routine int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT | PACKED_MATRIX; // Default is Column-Major creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; // Populate A and blasX populate( AP + params->offa, params-> N, params-> N, 0, BlasFn, creationFlags); populate( blasX , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn); // Copy blasX to clblasX memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufAP = base->createEnqueueBuffer(AP, (lengthAP + params->offa)* sizeof(*AP), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_WRITE_ONLY); bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_ONLY); //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); ::std::cerr << "Calling reference xTPMV routine... "; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; order = params->order; fUplo = params->uplo; fTrans = params->transA; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params->transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; if( params->transA == clblasConjTrans ) doConjugate( (AP +params->offa), (( params->N * (params->N + 1)) / 2) , 1, 1 ); } ::clMath::blas::tpmv( order, fUplo, fTrans, params->diag, params->N, AP, params->offa, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; // Hold X vector if ((bufAP == NULL) || (bufX == NULL) || (bufXTemp == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufAP, bufX, bufXTemp); deleteBuffers(AP, blasX, clblasX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTPMV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::tpmv( type, params->order, params->uplo, params->transA, params->diag, params->N, bufAP, params->offa, bufX, params->offBX, params->incx, bufXTemp, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufXTemp); deleteBuffers(AP, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TPMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufAP, bufX, bufXTemp); deleteBuffers(AP, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "TPMV: Reading results failed...." << std::endl; } releaseMemObjects(bufAP, bufX, bufXTemp); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); deleteBuffers(AP, blasX, clblasX); delete[] events; } // Instantiate the test TEST_P(TPMV, stpmv) { TestParams params; getParams(¶ms); tpmvCorrectnessTest(¶ms); } TEST_P(TPMV, dtpmv) { TestParams params; getParams(¶ms); tpmvCorrectnessTest(¶ms); } TEST_P(TPMV, ctpmv) { TestParams params; getParams(¶ms); tpmvCorrectnessTest(¶ms); } TEST_P(TPMV, ztpmv) { TestParams params; getParams(¶ms); tpmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-tpsv.cpp000066400000000000000000000167441264277366700216620ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
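 *
 * Packed triangular solve (xTPSV) correctness test: randomTrsvMatrices()
 * generates the input data, trsvDelta() provides per-element tolerances,
 * and the clMath::clblas::tpsv result is compared against the reference
 * clMath::blas::tpsv solution with compareMatrices().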
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include #include "trsv-delta.h" static void releaseMemObjects(cl_mem objA, cl_mem objX) { if (objA != NULL) clReleaseMemObject(objA); if (objX != NULL) clReleaseMemObject(objX); } template static void deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX) { if( A != NULL ) { delete[] A; } if( blasX != NULL ) { delete[] blasX; } if( backX != NULL ) { delete[] backX; } if( deltaX != NULL ) { delete[] deltaX; } } template void tpsvCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasX, *backX; cl_double *deltaX; cl_mem bufA, bufX; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = (params->N * (params->N + 1)) / 2; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); A = new T[lengthA + params->offa]; blasX = new T[lengthX + params->offBX]; backX = new T[lengthX + params->offBX]; deltaX = new cl_double[lengthX + params->offBX]; if ((A==NULL) || (blasX == NULL) || (backX == NULL) || (deltaX == NULL)) { ::std::cerr << "Unable to allocate matrices in Host memory" << std::endl; deleteBuffers(A, blasX, backX, deltaX); delete[] events; SUCCEED(); return; } memset( deltaX, 0, lengthX*sizeof(cl_double) ); memset( blasX, 0, lengthX*sizeof(T) ); srand(params->seed); ::std::cerr << "Generating input data... "; //custom generation function in blas-random.h randomTrsvMatrices( params->order, params->uplo, params->diag, params->N, (A + params->offa), 0, (blasX + params->offBX), params->incx); // Generate delta X for result comparison trsvDelta( params->order, params->uplo, params->transA, params->diag, params->N, (A + params->offa), 0, (blasX + params->offBX), params->incx, (deltaX + params->offBX) ); /*printf("\n\n before acml call\nA\n"); printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, A); printf("\nX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, blasX);*/ // Copy blasX to clblasX memcpy(backX, blasX, (lengthX + params->offBX) * sizeof(T)); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(backX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_WRITE_ONLY); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xTPSV routine... "; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; order = params->order; fUplo = params->uplo; fTrans = params->transA; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params->transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; if( params->transA == clblasConjTrans ) doConjugate((A + params->offa), 1, lengthA, 1); } ::clMath::blas::tpsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; /* printf("\n\n acml result X\n"); printf("\nblasX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, blasX);*/ if ((bufA == NULL) || (bufX == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX); deleteBuffers(A, blasX, backX, deltaX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTPSV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::tpsv(type, params->order, params->uplo, params->transA, params->diag, params->N, bufA, params->offa, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { deleteBuffers(A, blasX, backX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TPSV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { deleteBuffers(A, blasX, backX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, lengthX * sizeof(*backX), backX, 0, NULL, NULL); releaseMemObjects(bufA, bufX); /* printf("\n\n clblas result X\n"); printf("\nclBlasX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, backX); printf("\n\n delta X\n\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, deltaX);*/ // handle lda correctly based on row-major/col-major.. compareMatrices( clblasColumnMajor, lengthX , 1, blasX, backX, lengthX, deltaX ); deleteBuffers(A, blasX, backX, deltaX); delete[] events; } // Instantiate the test TEST_P(TPSV, stpsv) { TestParams params; getParams(¶ms); tpsvCorrectnessTest(¶ms); } TEST_P(TPSV, dtpsv) { TestParams params; getParams(¶ms); tpsvCorrectnessTest(¶ms); } TEST_P(TPSV, ctpsv) { TestParams params; getParams(¶ms); tpsvCorrectnessTest(¶ms); } TEST_P(TPSV, ztpsv) { TestParams params; getParams(¶ms); tpsvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-trmm.cpp000066400000000000000000000160351264277366700216360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
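 *
 * Triangular matrix-matrix product (xTRMM) correctness test: the reference
 * routine is always invoked in column-major order, so row-major inputs are
 * converted with reorderMatrix() before and after the host call; the
 * clMath::clblas::trmm result is then compared against the reference B
 * with compareMatrices().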
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "tcase-filter.h" static void releaseMemObjects(cl_mem A, cl_mem B) { clReleaseMemObject(A); clReleaseMemObject(B); } template static void deleteBuffers(T *A, T *blasB, T *clblasB) { delete[] A; delete[] blasB; delete[] clblasB; } template void trmmCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasB, *clblasB; T alpha; cl_mem bufA, bufB; clMath::BlasBase *base; bool useAlpha; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; blasB = new T[params->rowsB * params->columnsB]; clblasB = new T[params->rowsB * params->columnsB]; alpha = ZERO(); srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } ::std::cerr << "Generating input data... "; randomTrmmMatrices(params->order, params->side, params->uplo, params->diag, params->M, params->N, useAlpha, &alpha, A, params->lda, blasB, params->ldb); memcpy(clblasB, blasB, params->rowsB * params->columnsB * sizeof(*blasB)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xTRMM routine... "; if (params->order == clblasColumnMajor) { ::clMath::blas::trmm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, A, params->lda, blasB, params->ldb); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsB, params->columnsB, blasB, reorderedB); ::clMath::blas::trmm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB); reorderMatrix(clblasColumnMajor, params->rowsB, params->columnsB, reorderedB, blasB); delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(clblasB, params->rowsB * params->columnsB * sizeof(*clblasB), params->offBX * sizeof(*clblasB), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB); deleteBuffers(A, blasB, clblasB); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRMM routine... "; err = (cl_int)::clMath::clblas::trmm(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, blasB, clblasB); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRMM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, blasB, clblasB); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, params->offBX * sizeof(*clblasB), params->rowsB * params->columnsB * sizeof(*clblasB), clblasB, 0, NULL, NULL); releaseMemObjects(bufA, bufB); compareMatrices(params->order, params->M, params->N, blasB, clblasB, params->ldb); deleteBuffers(A, blasB, clblasB); delete[] events; } // Instantiate the test TEST_P(TRMM, strmm) { TestParams params; getParams(¶ms); trmmCorrectnessTest(¶ms); } TEST_P(TRMM, dtrmm) { TestParams params; getParams(¶ms); trmmCorrectnessTest(¶ms); } TEST_P(TRMM, ctrmm) { TestParams params; getParams(¶ms); trmmCorrectnessTest(¶ms); } TEST_P(TRMM, ztrmm) { TestParams params; getParams(¶ms); trmmCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-trmv.cpp000066400000000000000000000174061264277366700216520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
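 *
 * Triangular matrix-vector product (xTRMV) correctness test: A and X are
 * filled by populate(), row-major cases are mapped to the equivalent
 * column-major problem by swapping uplo and the transpose flag (and
 * conjugating A for ConjTrans), and the clMath::clblas::trmv output is
 * compared against the reference clMath::blas::trmv result.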
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include static void releaseMemObjects(cl_mem objA, cl_mem objX, cl_mem objXTemp) { if(objA != NULL) { clReleaseMemObject(objA); } if(objX != NULL) { clReleaseMemObject(objX); } if(objXTemp != NULL) { clReleaseMemObject(objXTemp); } } template static void deleteBuffers(T *A, T *blasX, T *clblasX) { if(A != NULL) { delete[] A; } if(blasX != NULL) { delete[] blasX; } if(clblasX != NULL) { delete[] clblasX; } } template void trmvCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasX, *clblasX; cl_mem bufA, bufX, bufXTemp; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); A = new T[lengthA + params->offa ]; blasX = new T[lengthX + params->offBX ]; clblasX = new T[lengthX + params->offBX ]; if((A == NULL) || (blasX == NULL) || (clblasX == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers(A, blasX, clblasX); delete[] events; SUCCEED(); return; } srand(params->seed); ::std::cerr << "Generating input data... "; // Set data in A and X using populate() routine int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (params-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (params-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; // Populate A and blasX populate( A + params->offa, params-> N, params-> N, params-> lda, BlasFn, creationFlags); populate( blasX , (lengthX + params->offBX), 1, (lengthX + params->offBX), BlasFn); // Copy blasX to clblasX memcpy(clblasX, blasX, (lengthX + params->offBX)* sizeof(*blasX)); ::std::cerr << "Done" << ::std::endl; // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(*A), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(clblasX, (lengthX + params->offBX)* sizeof(*clblasX), 0, CL_MEM_WRITE_ONLY); bufXTemp = base->createEnqueueBuffer(NULL, lengthX * sizeof(*clblasX), 0, CL_MEM_READ_ONLY); //printData( "bufX", blasX, lengthX, 1, lengthX); //printData( "clblasX", clblasX, lengthX, 1, lengthX); ::std::cerr << "Calling reference xTRMV routine... "; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; order = params->order; fUplo = params->uplo; fTrans = params->transA; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params->transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; if( params->transA == clblasConjTrans ) doConjugate( (A + params->offa), params->N, params->N, params->lda ); } ::clMath::blas::trmv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; // Hold X vector if ((bufA == NULL) || (bufX == NULL) || (bufXTemp == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX, bufXTemp); deleteBuffers(A, blasX, clblasX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRMV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::trmv( type, params->order, params->uplo, params->transA, params->diag, params->N, bufA, params->offa, params->lda, bufX, params->offBX, params->incx, bufXTemp, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufXTemp); deleteBuffers(A, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRMV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufX, bufXTemp); deleteBuffers(A, blasX, clblasX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; err = clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, (lengthX + params->offBX) * sizeof(*clblasX), clblasX, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "TRMV: Reading results failed...." << std::endl; } releaseMemObjects(bufA, bufX, bufXTemp); // handle lda correctly based on row-major/col-major.. // printData( "Ref blasX result:", blasX, lengthX, 1, lengthX); // printData( "OpenCL clblasX result:", clblasX, lengthX, 1, lengthX); compareMatrices(clblasColumnMajor, lengthX , 1, (blasX + params->offBX), (clblasX + params->offBX), lengthX); deleteBuffers(A, blasX, clblasX); delete[] events; } // Instantiate the test TEST_P(TRMV, strmv) { TestParams params; getParams(¶ms); trmvCorrectnessTest(¶ms); } TEST_P(TRMV, dtrmv) { TestParams params; getParams(¶ms); trmvCorrectnessTest(¶ms); } TEST_P(TRMV, ctrmv) { TestParams params; getParams(¶ms); trmvCorrectnessTest(¶ms); } TEST_P(TRMV, ztrmv) { TestParams params; getParams(¶ms); trmvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/corr-trsm.cpp000066400000000000000000000323331264277366700216430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include "trsm-delta.h" #include "tcase-filter.h" static void releaseMemObjects(cl_mem A, cl_mem B) { clReleaseMemObject(A); clReleaseMemObject(B); } template static void deleteBuffers(T *A, T *B, T *blasB, T *clblasB, cl_double *delta) { delete[] A; delete[] B; delete[] blasB; delete[] clblasB; delete[] delta; } template void trsmCorrectnessTest(TestParams *params) { cl_int err; T *A, *B, *blasB, *clblasB; T alpha; cl_mem bufA, bufB; cl_double *delta; clMath::BlasBase *base; bool useAlpha; cl_event *events; bool isComplex; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } isComplex = ((typeid(T) == typeid(FloatComplex)) || (typeid(T) == typeid(DoubleComplex))); if (canCaseBeSkipped(params, isComplex)) { std::cerr << ">> Test is skipped because it has no importance for this " "level of coverage" << std::endl; SUCCEED(); return; } useAlpha = base->useAlpha(); alpha = ZERO(); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); A = new T[params->rowsA * params->columnsA]; B = new T[params->rowsB * params->columnsB]; blasB = new T[params->rowsB * params->columnsB]; clblasB = new T[params->rowsB * params->columnsB]; delta = new cl_double[params->rowsB * params->columnsB]; srand(params->seed); if (useAlpha) { alpha = convertMultiplier(params->alpha); } ::std::cerr << "Generating input data... "; randomTrsmMatrices(params->order, params->side, params->uplo, params->diag, params->M, params->N, useAlpha, &alpha, A, params->lda, B, params->ldb); memcpy(blasB, B, params->rowsB * params->columnsB * sizeof(*B)); memcpy(clblasB, B, params->rowsB * params->columnsB * sizeof(*B)); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xTRSM routine... 
"; if (params->order == clblasColumnMajor) { ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, A, params->lda, blasB, params->ldb); } else { T *reorderedA = new T[params->rowsA * params->columnsA]; T *reorderedB = new T[params->rowsB * params->columnsB]; reorderMatrix(clblasRowMajor, params->rowsA, params->columnsA, A, reorderedA); reorderMatrix(clblasRowMajor, params->rowsB, params->columnsB, blasB, reorderedB); ::clMath::blas::trsm(clblasColumnMajor, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, reorderedA, params->rowsA, reorderedB, params->rowsB); reorderMatrix(clblasColumnMajor, params->rowsB, params->columnsB, reorderedB, blasB); delete[] reorderedB; delete[] reorderedA; } ::std::cerr << "Done" << ::std::endl; bufA = base->createEnqueueBuffer(A, params->rowsA * params->columnsA * sizeof(*A), params->offA * sizeof(*A), CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(clblasB, params->rowsB * params->columnsB * sizeof(*clblasB), params->offBX * sizeof(*clblasB), CL_MEM_READ_WRITE); if ((bufA == NULL) || (bufB == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, delta); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRSM routine... "; err = (cl_int)::clMath::clblas::trsm(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, alpha, bufA, params->offA, params->lda, bufB, params->offBX, params->ldb, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, delta); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, delta); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, params->offBX * sizeof(*clblasB), params->rowsB * params->columnsB * sizeof(*clblasB), clblasB, 0, NULL, NULL); releaseMemObjects(bufA, bufB); trsmDelta(params->order, params->side, params->uplo, params->transA, params->diag, params->M, params->N, A, params->lda, B, params->ldb, alpha, delta); compareMatrices(params->order, params->M, params->N, blasB, clblasB, params->ldb, delta); deleteBuffers(A, B, blasB, clblasB, delta); delete[] events; } // Instantiate the test TEST_P(TRSM, strsm) { TestParams params; getParams(¶ms); trsmCorrectnessTest(¶ms); } TEST_P(TRSM, dtrsm) { TestParams params; getParams(¶ms); trsmCorrectnessTest(¶ms); } TEST_P(TRSM, ctrsm) { TestParams params; getParams(¶ms); trsmCorrectnessTest(¶ms); } TEST_P(TRSM, ztrsm) { TestParams params; getParams(¶ms); trsmCorrectnessTest(¶ms); } // ==================================== // Adding some tests to catch bugs in the scenario where lda != M int arithsum(int i) { int j; for(j=i-1; j>0; j--) i += j; return i; } template void AssignA(T *A, size_t i, size_t j, size_t ld) { A[i*ld + j] = j == i ? (j+1) : ( j > i ? 
0 : 1.0 ); } template <> void AssignA(FloatComplex *A, size_t i, size_t j, size_t ld) { FloatComplex *Ac = (FloatComplex *)A; Ac[i*ld + j].s[0] = j == i ? (j+1) : ( j > i ? 0 : 1.0 ); Ac[i*ld + j].s[1] = 0; } template <> void AssignA(DoubleComplex *A, size_t i, size_t j, size_t ld) { DoubleComplex *Az = (DoubleComplex *)A; Az[i*ld + j].s[0] = j == i ? (j+1) : ( j > i ? 0 : 1.0 ); Az[i*ld + j].s[1] = 0; } template void AssignB(T *B, size_t i, size_t j, size_t ld, size_t M) { B[i*ld + j] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1); } template <> void AssignB(FloatComplex *B, size_t i, size_t j, size_t ld, size_t M) { FloatComplex *Bc = (FloatComplex *)B; Bc[i*ld + j].s[0] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1); Bc[i*ld + j].s[1] = 0; } template <> void AssignB(DoubleComplex *B, size_t i, size_t j, size_t ld, size_t M) { DoubleComplex *Bz = (DoubleComplex *)B; Bz[i*ld + j].s[0] = arithsum(M) - arithsum(j+1) + (j+1)*(j+1); Bz[i*ld + j].s[1] = 0; } template void local_assert(T x, T y, T d) { ASSERT_NEAR(x, y, d); } template <> void local_assert(FloatComplex x, FloatComplex y, FloatComplex d) { ASSERT_NEAR(x.s[0], y.s[0], d.s[0]); ASSERT_NEAR(x.s[1], y.s[1], d.s[1]); } template <> void local_assert(DoubleComplex x, DoubleComplex y, DoubleComplex d) { ASSERT_NEAR(x.s[0], y.s[0], d.s[0]); ASSERT_NEAR(x.s[1], y.s[1], d.s[1]); } template void Extratest(size_t M, size_t N, size_t lda, size_t ldb, T alpha, T delta) { T *A, *B, *blasB, *clblasB; cl_mem bufA, bufB; clMath::BlasBase *base; cl_event *events; cl_int err; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } clblasOrder order = clblasColumnMajor; clblasSide side = clblasLeft; clblasUplo uplo = clblasUpper; clblasTranspose trans = clblasNoTrans; clblasDiag diag = clblasNonUnit; A = new T[M * lda]; B = new T[N * ldb]; blasB = new T[N * ldb]; clblasB = new T[N * ldb]; memset(A, 0, M*lda*sizeof(T)); memset(B, 0, N*ldb*sizeof(T)); for(int i=0; i(A, i, j, lda); } } for(int i=0; i(B, i, j, ldb, M); } } memcpy(blasB, B, N*ldb*sizeof(T)); memcpy(clblasB, B, N*ldb*sizeof(T)); ::std::cerr << "Calling reference xTRSM routine... "; ::clMath::blas::trsm(order, side, uplo, trans, diag, M, N, alpha, A, lda, blasB, ldb); bufA = base->createEnqueueBuffer(A, M*lda*sizeof(T), 0, CL_MEM_READ_ONLY); bufB = base->createEnqueueBuffer(clblasB, N*ldb*sizeof(T), 0, CL_MEM_READ_WRITE); events = new cl_event[1]; memset(events, 0, sizeof(cl_event)); if ((bufA == NULL) || (bufB == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, NULL); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRSM routine... 
"; err = (cl_int)::clMath::clblas::trsm(order, side, uplo, trans, diag, M, N, alpha, bufA, 0, lda, bufB, 0, ldb, 1, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, NULL); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSM() failed"; } err = waitForSuccessfulFinish(1, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufA, bufB); deleteBuffers(A, B, blasB, clblasB, NULL); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufB, CL_TRUE, 0, N*ldb*sizeof(T), clblasB, 0, NULL, NULL); releaseMemObjects(bufA, bufB); // Validate the answer for(int i=0; i(A, B, blasB, clblasB, NULL); delete[] events; } #define ETST_TOLERENCE 1E-5 TEST(TRSM_extratest, strsm) { Extratest(5, 2, 32, 32, 1.0f, ETST_TOLERENCE); } TEST(TRSM_extratest, dtrsm) { Extratest(5, 2, 32, 32, 1.0, ETST_TOLERENCE); } TEST(TRSM_extratest, ctrsm) { FloatComplex alpha = floatComplex(1.0f, 0); FloatComplex delta = floatComplex(ETST_TOLERENCE, ETST_TOLERENCE); Extratest(5, 2, 32, 32, alpha, delta); } TEST(TRSM_extratest, ztrsm) { DoubleComplex alpha = doubleComplex(1.0, 0); DoubleComplex delta = doubleComplex(ETST_TOLERENCE, ETST_TOLERENCE); Extratest(5, 2, 32, 32, alpha, delta); }clblas-2.10/src/tests/correctness/corr-trsv.cpp000066400000000000000000000170351264277366700216560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #include #include #include "trsv-delta.h" static void releaseMemObjects(cl_mem objA, cl_mem objX) { if (objA != NULL) clReleaseMemObject(objA); if (objX != NULL) clReleaseMemObject(objX); } template static void deleteBuffers(T *A, T *blasX, T *backX, cl_double *deltaX) { if( A != NULL ) { delete[] A; } if( blasX != NULL ) { delete[] blasX; } if( backX != NULL ) { delete[] backX; } if( deltaX != NULL ) { delete[] deltaX; } } template void trsvCorrectnessTest(TestParams *params) { cl_int err; T *A, *blasX, *backX; cl_double *deltaX; cl_mem bufA, bufX; clMath::BlasBase *base; cl_event *events; base = clMath::BlasBase::getInstance(); if ((typeid(T) == typeid(cl_double) || typeid(T) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthA = params->N * params->lda; size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); A = new T[lengthA + params->offa]; blasX = new T[lengthX + params->offBX]; backX = new T[lengthX + params->offBX]; deltaX = new cl_double[lengthX + params->offBX]; if ((A==NULL) || (blasX == NULL) || (backX == NULL) || (deltaX == NULL)) { ::std::cerr << "Unable to allocate matrices in Host memory" << std::endl; deleteBuffers(A, blasX, backX, deltaX); delete[] events; SUCCEED(); return; } memset( deltaX, 0, lengthX*sizeof(cl_double) ); memset( blasX, 0, lengthX*sizeof(T) ); srand(params->seed); ::std::cerr << "Generating input data... "; //custom generation function in blas-random.h randomTrsvMatrices( params->order, params->uplo, params->diag, params->N, (A + params->offa), params->lda, (blasX + params->offBX), params->incx); // Generate delta X for result comparison trsvDelta( params->order, params->uplo, params->transA, params->diag, params->N, (A + params->offa), params->lda, (blasX + params->offBX), params->incx, (deltaX + params->offBX) ); /*printf("\n\n before acml call\nA\n"); printMatrixBlock( params->order, 0, 0, params->N, params->N, params->lda, A); printf("\nX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, blasX);*/ // Copy blasX to clblasX memcpy(backX, blasX, (lengthX + params->offBX) * sizeof(T)); // Allocate buffers bufA = base->createEnqueueBuffer(A, (lengthA + params->offa)* sizeof(T), 0, CL_MEM_READ_ONLY); bufX = base->createEnqueueBuffer(backX, (lengthX + params->offBX)* sizeof(T), 0, CL_MEM_WRITE_ONLY); ::std::cerr << "Done" << ::std::endl; ::std::cerr << "Calling reference xTRSV routine... "; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; order = params->order; fUplo = params->uplo; fTrans = params->transA; if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params->uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params->transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; if( params->transA == clblasConjTrans ) doConjugate((A + params->offa), params->N, params->N, params->lda ); } ::clMath::blas::trsv( order, fUplo, fTrans, params->diag, params->N, A, params->offa, params->lda, blasX, params->offBX, params->incx); ::std::cerr << "Done" << ::std::endl; /* printf("\n\n acml result X\n"); printf("\nblasX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, blasX);*/ if ((bufA == NULL) || (bufX == NULL)) { /* Skip the test, the most probable reason is * matrix too big for a device. */ releaseMemObjects(bufA, bufX); deleteBuffers(A, blasX, backX, deltaX); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } ::std::cerr << "Calling clblas xTRSV routine... "; DataType type; type = ( typeid(T) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; // Should use bufXTemp as well err = (cl_int)::clMath::clblas::trsv(type, params->order, params->uplo, params->transA, params->diag, params->N, bufA, params->offa, params->lda, bufX, params->offBX, params->incx, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { deleteBuffers(A, blasX, backX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::TRSV() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { deleteBuffers(A, blasX, backX, deltaX); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } ::std::cerr << "Done" << ::std::endl; clEnqueueReadBuffer(base->commandQueues()[0], bufX, CL_TRUE, 0, lengthX * sizeof(*backX), backX, 0, NULL, NULL); releaseMemObjects(bufA, bufX); /* printf("\n\n clblas result X\n"); printf("\nclBlasX\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, backX); printf("\n\n delta X\n\n"); printMatrixBlock( clblasColumnMajor, 0, 0, lengthX, 1, lengthX, deltaX);*/ // handle lda correctly based on row-major/col-major.. compareMatrices( clblasColumnMajor, lengthX , 1, blasX, backX, lengthX, deltaX ); deleteBuffers(A, blasX, backX, deltaX); delete[] events; } // Instantiate the test TEST_P(TRSV, strsv) { TestParams params; getParams(¶ms); trsvCorrectnessTest(¶ms); } TEST_P(TRSV, dtrsv) { TestParams params; getParams(¶ms); trsvCorrectnessTest(¶ms); } TEST_P(TRSV, ctrsv) { TestParams params; getParams(¶ms); trsvCorrectnessTest(¶ms); } TEST_P(TRSV, ztrsv) { TestParams params; getParams(¶ms); trsvCorrectnessTest(¶ms); } clblas-2.10/src/tests/correctness/delta.h000066400000000000000000000024221264277366700204450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
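 *
 * Defines the type-dependent base tolerance DELTA_0<T>() used by the
 * correctness comparisons: 2^-20 for the single-precision (real and complex)
 * types and 2^-50 for the double-precision ones.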
* ************************************************************************/ #ifndef DELTA_H_ #define DELTA_H_ #include #include // Type-dependant constants template static cl_double DELTA_0(); template<> __template_static cl_double DELTA_0() { return pow(2.0, -20); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -50); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -20); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -50); } #endif // DELTA_H clblas-2.10/src/tests/correctness/tcase-filter.cpp000066400000000000000000000123561264277366700223000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include "tcase-filter.h" #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) static __inline size_t selectSize(size_t orig, size_t alt) { return (orig) ? orig : alt; } static size_t nonZeroSize(size_t size1, size_t size2, size_t size3) { size_t r = 0; if (size1) { r = size1; } else if (size2) { r = size2; } else { r = size3; } return r; } static int sizeEquCount(size_t size1, size_t size2, size_t size3) { int cnt = 0; cnt += static_cast(size1 == size2); cnt += static_cast(size2 == size3); cnt += static_cast(size1 == size3); return cnt; } static __inline bool isEquToAny(size_t size, size_t alt1, size_t alt2, size_t alt3) { return ((size == alt1) || (size == alt2) || (size == alt3)); } static __inline bool isRealConjugation(const TestParams *params, bool isComplex) { return !isComplex && ((params->transA == clblasConjTrans) || (params->transB == clblasConjTrans)); } #endif /* SHORT_TESTS || MEDIUM_TESTS */ #if defined(SHORT_TESTS) bool canCaseBeSkipped(const TestParams *params, bool isComplex) { size_t s; size_t m, n, k, lda, ldb, ldc; // skip cases with conjugated transposition for real data if (isRealConjugation(params, isComplex)) { return true; } /* * Enable only cases at which all the problem dimensions are equal * to each other */ s = nonZeroSize(params->M, params->N, params->K); m = selectSize(params->M, s); n = selectSize(params->N, s); k = selectSize(params->K, s); if (sizeEquCount(m, n, k) < 3) { return true; } /* * filter BigLDA cases */ /* s = nonZeroSize(params->lda, params->ldb, params->ldc); lda = selectSize(params->lda, s); ldb = selectSize(params->ldb, s); ldc = selectSize(params->ldc, s); if (sizeEquCount(lda, ldb, ldc) < 3) { return true; } if (!isEquToAny(lda, m, n, k)) { return true; } */ return false; } #elif defined(MEDIUM_TESTS) /* SHORT_TESTS */ #include #include /* * Evaluate best vector length that buffer with such leading dimension * would have for such leading dimension. 
*/ static unsigned int prognozedVecLen(size_t ld) { size_t u = static_cast(1) << (sizeof(size_t) * 8 - 1); size_t vecLen; // typically vecLen will not exceed 8 ld %= 8; if (ld == 0) { return 8; } else if (ld == 1) { return 1; } // find the highest non zero bit for (; (u != 0) && !(u & ld); u >>= 1); /* * Evaluated as minimum of modules based operation results against * upper and lower power of 2 bounds */ vecLen = ld - u; u >>= 1; vecLen = ::std::min(vecLen, u - ld); return static_cast(vecLen); } bool canCaseBeSkipped(const TestParams *params, bool isComplex) { size_t s; size_t m, n, k, lda, ldb, ldc; int bigCnt = 0; unsigned int vecLen; // skip cases with conjugated transposition for real data if (isRealConjugation(params, isComplex)) { return true; } // set of cases for extended versions is really tiny, so enable them all if (params->offA || params->offBX || params->offCY) { return false; } s = nonZeroSize(params->M, params->N, params->K); m = selectSize(params->M, s); n = selectSize(params->N, s); k = selectSize(params->K, s); // enable BigLDA cases when problem dimensions all are equal to each other s = nonZeroSize(params->lda, params->ldb, params->ldc); lda = selectSize(params->lda, s); ldb = selectSize(params->ldb, s); ldc = selectSize(params->ldc, s); bigCnt += static_cast(!isEquToAny(lda, m, n, k)); bigCnt += static_cast(!isEquToAny(ldb, m, n, k)); bigCnt += static_cast(!isEquToAny(ldc, m, n, k)); if (bigCnt) { if (sizeEquCount(m, n, k) < 3) { return true; } else { return false; } } // enable only cases at which buffers will have the same vectorization vecLen = prognozedVecLen(lda); if ((prognozedVecLen(ldb) != vecLen) || (prognozedVecLen(ldc) != vecLen)) { return true; } return false; } #else /* MEDIUM_TESTS */ bool canCaseBeSkipped(const TestParams *params, bool isComplex) { (void)params; (void)isComplex; return false; } #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ clblas-2.10/src/tests/correctness/tcase-filter.h000066400000000000000000000017621264277366700217440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Filter for skipping test cases when run time is more important than * coverage */ #ifndef TCASEFILTER_H_ #define TCASEFILTER_H_ #include bool canCaseBeSkipped(const TestParams *params, bool isComplex); #endif /* TCASEFILTER_H_ */ clblas-2.10/src/tests/correctness/test-correctness.cpp000066400000000000000000004236301264277366700232260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #define DO_GEMM #define DO_TRMM #define DO_TRSM #define DO_SYR2K #define DO_SYRK #define DO_GEMV #define DO_SYMV #define DO_SYMM #define DO_TRMV #define DO_TPMV #define DO_TRSV #define DO_SYR #define DO_SPR #define DO_GER #define DO_GERC #define DO_SYR2 #define DO_HER #define DO_HER2 #define DO_HEMM #define DO_HEMV #define DO_HPMV #define DO_SPMV #define DO_SBMV #define DO_HERK #define DO_TPSV #define DO_HPR #define DO_SPR2 #define DO_HPR2 #define DO_GBMV #define DO_HBMV #define DO_TBMV #define DO_TBSV #define DO_HER2K #define DO_SWAP #define DO_COPY #define DO_SCAL #define DO_AXPY #define DO_DOT #define DO_DOTC #define DO_ROTG #define DO_ROTM #define DO_ROT #define DO_ROTMG #define DO_NRM2 #define DO_ASUM #define DO_iAMAX //#define DO_SPL - Only used for special case testing (for devel purposes) //#define DO_GEMM_2 - This needs to remain commented. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using ::testing::Combine; TestParams globalTestParams; // Different ranges of test parameters static const clblasOrder orderSet[] = { clblasColumnMajor, clblasRowMajor }; static const clblasTranspose transSet[] = { clblasNoTrans, clblasTrans, clblasConjTrans }; static const clblasSide sideSet[] = { clblasLeft, clblasRight }; static const clblasUplo uploSet[] = { clblasUpper, clblasLower }; static const clblasDiag diagSet[] = { clblasUnit, clblasNonUnit }; const size_t ZERO_VAL[1] = { 0 }; const int ONE_VAL[1] = { 1 }; const int verySmallRange[] = {1, 3, 5, 10, 11, 15, 16, 23, 21, 32, 33, 45, 40, 63, 333, 1024, 1025, 4096, 4223}; const int completeRange[] = {1, 3, 5, 10, 11, 15, 16, 23, 21, 32, 33, 45, 40, 63, 333, 1024, 1025, 4096, 4223}; #if defined SHORT_TESTS const int smallRange[] = { 63, 128 }; const int numQueues[] = { 2 }; #elif defined MEDIUM_TESTS /* SHORT_TESTS */ const int smallRange[] = { 15, 64, 133 }; const int numQueues[] = { 3, 4 }; #else /* MEDIUM_TESTS */ const int smallRange[] = //{ 15, 16, 33, 40, 62, 64, 128, 129, 256, 258 }; { 8, 16, 17, 32, 62, 64, 128, 144, 256 }; //{ 15, 16, 32, 33, 63, 64, 128, 129, 256, 257 }; //{ 3, 4, 15, 16, 32, 33, 63, 64, 128, 129, 256, 257, 333, 566, 787, 1024, 1025, 1113, 1111, 999, 883, 633, 17 }; const int numQueues[] = { 2, 3, 4, 5, 6, 7 }; #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) enum { BIG_LDA = 500, BIG_LDB = 600, BIG_LDC = 700 }; const int incs[] = { 33, -33 }; #else /* SHORT_TESTS || MEDIUM_TESTS */ enum { BIG_LDA = 501, BIG_LDB = 602, BIG_LDC = 703 }; const int incs[] = { 1, -1, 33, -33 }; #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ #if 
defined(SHORT_TESTS) || defined(MEDIUM_TESTS) const size_t offs[] = { 63, 258 }; #else /* !SHORT_TESTS && !MEDIUM_TESTS */ const size_t offs[] = {0, 63, 128, 258 }; #endif const int ldaRange[] = {0, 3192, 4097 }; const int offsetRange[] = { 0, 100 }; const double realAlphaRange[] = {(double)50, (double)100, (double)999999}; const cl_float2 complexAlphaRange[] = {floatComplex(0,1), floatComplex(3,4)}; const cl_float2 complexAlpha = floatComplex(2,3); const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}}; const ComplexLong alphaBeta = {10,10}; const ComplexLong sflagRange[] = {{-1,0}, {0,0}, {1,0}, {-2,0}}; const ComplexLong rotCosMedium = {0, 3}; const ComplexLong rotSinMedium = {0, 4}; const ComplexLong rotCosShort = {1, 6}; const ComplexLong rotSinShort = {1, 2}; #ifdef DO_SPL INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER2_SPL, HER2, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange), Values(1) ) ); #endif #ifdef DO_HEMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HEMV, HEMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_0HEMV, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(order_HEMV, HEMV, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(uplo_HEMV, HEMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(alpha_beta_HEMV, HEMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_0HEMV, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_HEMV, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif // Correctness #endif #ifdef DO_SWAP #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(SmallRange, SWAPXY, Combine( Values(100,50), Values(0), Values(1), Values(0), Values(1), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_SWAP, SWAPXY, Combine( Values(64,128,256,512), Values(0,3), Values(1,-1), Values(0,3), Values(1,-1), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_SWAP, SWAPXY, Combine( ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), 
ValuesIn(offsetRange), ValuesIn(incs), Values(1))); #endif #endif #ifdef DO_AXPY #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Small_AXPY, AXPY, Combine( Values(100,50), ValuesIn(alphaBetaRange), Values(0), Values(1), Values(0), Values(1), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_AXPY, AXPY, Combine( Values(64,128,256,512), ValuesIn(alphaBetaRange), Values(0,3), Values(1,-1), Values(0,3), Values(1,-1), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_AXPY, AXPY, Combine( ValuesIn(completeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), Values(1))); #endif #endif #ifdef DO_ROTG #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Small_ROTG, ROTG, Combine( Values(1, 5), Values(1, 6), Values(2, 8), Values(3, 7), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_ROTG, ROTG, Combine( Values(64,128,256,512), Values(64, 128, 256, 512), Values(0,3), Values(0,3), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_ROTG, ROTG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1))); #endif #endif #ifdef DO_ROTM #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Small_ROTM, ROTM, Combine( Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(1, 6), ValuesIn(sflagRange), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_ROTM, ROTM, Combine( Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(0, 3), ValuesIn(sflagRange), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_ROTM, ROTM, Combine( ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(sflagRange), Values(1))); #endif #endif #ifdef DO_ROT #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Small_ROT, ROT, Combine( Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(rotCosShort), Values(rotSinShort), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_ROT, ROT, Combine( Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(rotCosMedium), Values(rotSinMedium), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_ROT, ROT, Combine( ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif #endif #ifdef DO_ROTMG #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Small_ROTMG, ROTMG, Combine( Values(1, 6), Values(1, 6), Values(1, 6), Values(1, 6), Values(1, 6), ValuesIn(sflagRange), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_ROTMG, ROTMG, Combine( Values(1, 3, 15), Values(0, 3, 15), Values(0, 3, 15), Values(0, 3, 15), Values(0, 3, 15), ValuesIn(sflagRange), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_ROTMG, ROTMG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(sflagRange), Values(1))); #endif #endif //NRM2 #ifdef DO_NRM2 #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_NRM2, NRM2, Combine( ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_NRM2, NRM2, Combine( Values(61), Values(4, -11), Values(0), Values(1), Values(1)) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_NRM2, NRM2, Combine( ValuesIn(smallRange), 
Values(-10), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_NRM2, NRM2, Combine( Values(4900), Values(1), Values(4), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_NRM2, NRM2, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_ASUM #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_ASUM, ASUM, Combine( ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_ASUM, ASUM, Combine( Values(61), Values(4, -11), Values(0), Values(1), Values(1)) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_ASUM, ASUM, Combine( ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_ASUM, ASUM, Combine( Values(4900), Values(1), Values(4), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_ASUM, ASUM, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_iAMAX #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_iAMAX, iAMAX, Combine( ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_iAMAX, iAMAX, Combine( Values(61), Values(4, -1), Values(0), Values(1), Values(1)) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_iAMAX, iAMAX, Combine( ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_iAMAX, iAMAX, Combine( Values(4900), Values(1), Values(4), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_iAMAX, iAMAX, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_HPMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HPMV, HPMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_0HPMV, HPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(order_HPMV, HPMV, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(uplo_HPMV, HPMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(alpha_beta_HPMV, HPMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_0HPMV, HPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), 
Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_HPMV, HPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif // Correctness #endif #ifdef DO_SYMM #if defined(SHORT_TESTS) /*INSTANTIATE_TEST_CASE_P(Short_SYMM, SYMM, Combine( Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));*/ INSTANTIATE_TEST_CASE_P(SelectedSmall_0SYMM, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), Values(15),Values(15), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) /*INSTANTIATE_TEST_CASE_P(order_SYMM, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),Values(clblasLower), ValuesIn(smallRange),ValuesIn(smallRange) ,ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 9, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(uplo_SYMM, SYMM, Combine( Values(clblasRowMajor), Values(clblasLeft),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 9, 0)), Values(1)));*/ INSTANTIATE_TEST_CASE_P(alpha_beta_SYMM, SYMM, Combine( Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), Values(64),Values(133), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 3, 7, 11)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_0SYMM, SYMM, Combine( ValuesIn(orderSet), Values(clblasLeft),Values(clblasLower), Values(1100),Values(4000), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 0, 0)), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_SYMM_FriendlyOffsets, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 64, 32, 128)), Values(1))); INSTANTIATE_TEST_CASE_P(ALL_SYMM_UnfriendlyOffsets, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 6, 3, 12)), Values(1))); #endif // Correctness #endif #ifdef DO_HEMM #if defined(SHORT_TESTS) /*INSTANTIATE_TEST_CASE_P(Short_HEMM, HEMM, Combine( Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1)));*/ INSTANTIATE_TEST_CASE_P(SelectedSmall_0HEMM, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), Values(15),Values(15), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) /*INSTANTIATE_TEST_CASE_P(order_HEMM, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),Values(clblasLower), ValuesIn(smallRange),ValuesIn(smallRange) ,ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 
0, 9, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(uplo_HEMM, HEMM, Combine( Values(clblasRowMajor), Values(clblasLeft),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 9, 0)), Values(1)));*/ INSTANTIATE_TEST_CASE_P(alpha_beta_HEMM, HEMM, Combine( Values(clblasRowMajor), Values(clblasLeft),Values(clblasLower), Values(64),Values(133), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 0, 9)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_0HEMM, HEMM, Combine( ValuesIn(orderSet), Values(clblasLeft),Values(clblasLower), Values(1010),Values( 4000), Values(complexAlpha), Values(complexAlpha), Values(clMath::ExtraTestSizes(0, (size_t)0, (size_t)0, 0, 1, 0)), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_HEMM, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet),ValuesIn(uploSet), ValuesIn(smallRange),ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, (size_t)512, (size_t)511, 9, 0, 0)), Values(1))); #endif // Correctness #endif #ifdef DO_SPMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SPMV, SPMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_0SPMV, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(order_SPMV, SPMV, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), ValuesIn(offs), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(uplo_SPMV, SPMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), ValuesIn(offs), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(alpha_beta_SPMV, SPMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_0SPMV, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(1500, 5101), Values(alphaBeta), Values(alphaBeta), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #else INSTANTIATE_TEST_CASE_P(ALL_SPMV, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(offs), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif // Correctness #endif #ifdef DO_GEMM_2 INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_NX, GEMM2, Combine( Values(clblasColumnMajor), Values(clblasNoTrans), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 1, 3, 10)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_TN, GEMM2, 
Combine( Values(clblasColumnMajor), Values(clblasTrans), Values(clblasNoTrans ), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, (size_t)501, (size_t)502, 3, 2, 1)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OFF_HN, GEMM2, Combine( Values(clblasColumnMajor), Values(clblasConjTrans), Values(clblasNoTrans ), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, (size_t)501, (size_t)502, 3, 2, 1)), Values(1))); #if !defined(SHORT_TESTS) && !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_NX, GEMM2, Combine( Values(clblasColumnMajor), Values(clblasNoTrans), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_TN, GEMM2, Combine( Values(clblasColumnMajor), Values(clblasTrans), Values(clblasNoTrans ), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_HN, GEMM2, Combine( Values(clblasColumnMajor), Values(clblasConjTrans), Values(clblasNoTrans ), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); #endif #endif //DO_GEMM_2 #ifdef DO_GEMM // xGEMM tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. 
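/*
 * Illustrative sketch, not part of the original test sources: why lda = 500 is
 * a safe "big" leading dimension for every entry of smallRange. For a
 * column-major M x N matrix the leading dimension must satisfy lda >= M (for
 * row-major storage, lda >= N), so any value larger than the biggest
 * smallRange entry (256 in the full list above) is legal for A, B and C no
 * matter which transpose options the case combines. The helper names below
 * are hypothetical; they only restate that rule and do not mirror the
 * library's internal argument checking.
 */
#include <algorithm>
#include <cassert>
#include <cstddef>

static bool
ldaIsValidExample(size_t rows, size_t cols, size_t lda, bool columnMajor)
{
    // BLAS convention: lda spans one full column (column-major) or row (row-major)
    return lda >= (columnMajor ? (std::max)(rows, (size_t)1)
                               : (std::max)(cols, (size_t)1));
}

static void
bigLdaExample(void)
{
    const size_t maxSmallRangeDim = 256; // largest value in smallRange above
    assert(ldaIsValidExample(maxSmallRangeDim, maxSmallRangeDim, 500, true));
    assert(ldaIsValidExample(maxSmallRangeDim, maxSmallRangeDim, 500, false));
}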
INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA_OffSet, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 1, 0, 0)), Values(1))); // Cases for extended versions with offsets #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), Values(67), Values(138), Values(220), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 600, 700)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), Values(67), Values(138), Values(220), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 600, 700)), Values(1))); #else /* SHORT_TESTS || MEDIUM_TESTS */ INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), Values(67), Values(135), Values(228), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), Values(64), Values(64), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), Values(128), Values(64), Values(77), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_3, GEMM, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(transSet), Values(112), Values(86), Values(68), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), Values(67), Values(135), Values(228), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), Values(64), Values(64), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), Values(128), Values(64), Values(77), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_3, GEMM, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(transSet), Values(112), Values(86), Values(68), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1))); #endif /* !SHORT_TESTS || !MEDIUM_TESTS */ // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, GEMM, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(clblasNoTrans), Values(2801), Values(2903), Values(3005), 
Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, GEMM, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(clblasNoTrans), Values(4777), Values(4333), Values(5000), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, GEMM, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(clblasNoTrans), Values(5777), Values(5333), Values(3000), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, GEMM, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(clblasConjTrans), Values(6777), Values(3333), Values(3000), Values(clMath::ExtraTestSizes()), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom cases INSTANTIATE_TEST_CASE_P(SelectedSmall_0, GEMM, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(clblasNoTrans), Values(1), Values(1), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, GEMM, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(clblasNoTrans), Values(2), Values(1), Values(3), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, GEMM, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(clblasNoTrans), Values(3), Values(2), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, GEMM, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(clblasConjTrans), Values(4), Values(3), Values(2), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, GEMM, Combine( ValuesIn(orderSet), Values(clblasConjTrans), Values(clblasNoTrans), Values(17), Values(13), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, GEMM, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet), Values(32), Values(32), Values(32), Values(clMath::ExtraTestSizes()), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_GEMM #ifdef DO_TRMM // xTRMM tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. 
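/*
 * Illustrative sketch, not from the original sources: gtest's Combine()
 * expands into the Cartesian product of its argument generators, which is why
 * the SHORT/MEDIUM builds shrink smallRange and lean on canCaseBeSkipped()
 * from tcase-filter.cpp to prune redundant cases. For the full (non-SHORT,
 * non-MEDIUM) ColumnMajor_SmallRange TRMM instantiation above, the counts
 * taken from the arrays defined earlier in this file multiply out as follows;
 * the constant name is hypothetical.
 */
static const unsigned long kTrmmSmallRangeTupleCountExample =
    2UL  /* sideSet  */ *
    2UL  /* uploSet  */ *
    3UL  /* transSet */ *
    2UL  /* diagSet  */ *
    9UL  /* smallRange, M */ *
    9UL  /* smallRange, N */ *
    1UL  /* ExtraTestSizes */ *
    1UL; /* numCommandQueues */
/* kTrmmSmallRangeTupleCountExample == 1944 parameter tuples generated by this
 * single INSTANTIATE_TEST_CASE_P, before any of the other instantiations. */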
INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1))); #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(158), Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(158), Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1))); #else /* SHORT_TESTS || MEDIUM_TESTS */ // Cases for extended versions with offsets INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(113), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(128), Values(66), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, TRMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(53), Values(67), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(113), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(128), Values(66), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, TRMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(53), Values(67), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1))); #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, TRMM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(2801), Values(2903), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, TRMM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(4567), Values(4321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, TRMM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), Values(5567), Values(5321), 
Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, TRMM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(6567), Values(3321), Values(clMath::ExtraTestSizes()), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, TRMM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(1), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, TRMM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(2), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, TRMM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), Values(3), Values(2), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, TRMM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(4), Values(3), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, TRMM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasUnit), Values(17), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(32), Values(clMath::ExtraTestSizes()), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_TRMM #ifdef DO_TRSM // xTRSM tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. 
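/*
 * Illustrative sketch, not from the original sources: the registration pattern
 * every instantiation in this file relies on. Combine() packs one value from
 * each generator into a tuple, the fixture receives it through GetParam(), and
 * INSTANTIATE_TEST_CASE_P registers one test per tuple. ExampleCase, its
 * assertion and the sample sizes below are hypothetical; only the
 * TEST_P / GetParam / Combine wiring mirrors the real TRSM/TRMM fixtures,
 * which use the ::std::tr1::tuple flavour shown in the HERK comment further
 * down this file.
 */
#include <gtest/gtest.h>

class ExampleCase : public ::testing::TestWithParam<
    ::std::tr1::tuple<int, int> > {};

TEST_P(ExampleCase, DimsArePositive)
{
    int m = ::std::tr1::get<0>(GetParam());
    int n = ::std::tr1::get<1>(GetParam());
    EXPECT_GT(m, 0);
    EXPECT_GT(n, 0);
}

INSTANTIATE_TEST_CASE_P(SmallSizes, ExampleCase,
    ::testing::Combine(::testing::Values(15, 16),
                       ::testing::Values(32, 33)));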
INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 0, 0, 0, 0)), Values(1))); #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(158), Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(158), Values(clMath::ExtraTestSizes(0, 0, 0, 502, 606, 0)), Values(1))); #else /* SHORT_TESTS || MEDIUM_TESTS */ // Cases for extended versions with offsets INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(113), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(128), Values(66), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, TRSM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(53), Values(67), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(115), Values(113), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(128), Values(66), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, TRSM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(53), Values(67), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 0)), Values(1))); #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, TRSM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(2801), Values(2903), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, TRSM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(4567), Values(4321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, TRSM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), Values(5567), Values(5321), 
Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, TRSM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(6567), Values(3321), Values(clMath::ExtraTestSizes()), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, TRSM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(1), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, TRSM, Combine( ValuesIn(orderSet), Values(clblasRight), Values(clblasUpper), Values(clblasTrans), Values(clblasNonUnit), Values(2), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, TRSM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), Values(3), Values(2), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, TRSM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(4), Values(3), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, TRSM, Combine( ValuesIn(orderSet), Values(clblasLeft), Values(clblasUpper), Values(clblasNoTrans), Values(clblasUnit), Values(17), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRSM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(32), Values(clMath::ExtraTestSizes()), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_TRSM #ifdef DO_GEMV // xGEMV tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, GEMV, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, GEMV, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. 
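/*
 * Illustrative sketch, not from the original sources: the incs[] array above
 * drives GEMV/SYMV cases with strides of +/-1 and +/-33. Under the usual BLAS
 * addressing convention a negative increment walks the vector backwards from
 * its last stored element, so cases with incx = -33 still touch exactly N
 * logical elements. vectorIndexExample is a hypothetical helper written only
 * to make that rule explicit; it is not the library's implementation.
 */
#include <cstddef>

static size_t
vectorIndexExample(size_t i, size_t n, int inc)
{
    // i-th logical element of an n-element vector stored with increment inc
    return (inc >= 0) ? i * (size_t)inc
                      : (n - 1 - i) * (size_t)(-inc);
}
/* Example: n = 4, inc = -33 maps logical elements 0..3 to storage offsets
 * 99, 66, 33, 0 (plus whatever offx the extended test case adds). */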
INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, GEMV, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, GEMV, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousInc, GEMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, incs, incs, ZERO_VAL, ZERO_VAL, ZERO_VAL)), Values(1))); // Cases for the extended version with offsets INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx, GEMV, Combine( Values(clblasColumnMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs, ZERO_VAL, ZERO_VAL)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx, GEMV, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs, ZERO_VAL, ZERO_VAL)), Values(1))); // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(2800), Values(2800), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(4567), Values(4321), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, GEMV, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(5567), Values(5321), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(6567), Values(3321), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(1), Values(1), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(2), Values(1), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, GEMV, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(3), Values(2), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, GEMV, Combine( ValuesIn(orderSet), Values(clblasTrans), Values(4), Values(3), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, GEMV, Combine( ValuesIn(orderSet), Values(clblasNoTrans), Values(17), Values(1), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, GEMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(32), Values(32), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_GEMV #ifdef DO_SYMV // xSYMV tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYMV, Combine( Values(clblasColumnMajor), 
ValuesIn(uploSet), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYMV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousInc, SYMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, incs, incs, ZERO_VAL, ZERO_VAL, ZERO_VAL)), Values(1))); // cases for the extended versions with offsets INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx, SYMV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs, ZERO_VAL, ZERO_VAL)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx, SYMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(clMath::makeContainerETS(ZERO_VAL, ONE_VAL, ONE_VAL, offs, ZERO_VAL, ZERO_VAL)), Values(1))); // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(2801), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined MEDIUM_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(4567), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(5567), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(6567), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(1), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(2), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(3), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(4), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(5), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif #ifdef 
DO_SYR2K // xSYR2K tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 501, 502, 0, 0, 0)), Values(1))); // cases for the extended versions with the offsets #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(254), Values(353), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 602, 704)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(254), Values(353), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 602, 704)), Values(1))); #else /* SHORT_TESTS || MEDIUM_TESTS */ INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(255), Values(253), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(128), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(75), Values(200), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_3, SYR2K, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(111), Values(256), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(255), Values(253), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(128), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 501, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(75), Values(200), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 502)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_3, SYR2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(111), Values(256), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 501, 502)), Values(1))); #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ // Big matrices #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYR2K, 
Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(2801), Values(2903), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(4567), Values(4321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(5567), Values(5321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYR2K, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(6567), Values(3321), Values(clMath::ExtraTestSizes()), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(1), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(2), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(3), Values(2), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYR2K, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(4), Values(3), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYR2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(17), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYR2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(32), Values(32), Values(clMath::ExtraTestSizes()), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_SYR2K #ifdef DO_HERK /* ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA int, // N int, // K ComplexLong, // alpha ComplexLong, // beta ExtraTestSizes, // offa, offc, lda, ldc. 
int // numCommandQueues */ #if !defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(SPL_HERK, HERK, Combine( Values(clblasColumnMajor, clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(513), Values(513), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), Values(1))); #endif #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HERK, HERK, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_HERK, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasConjTrans), Values(14), Values(15), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HERK, HERK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_HERK, HERK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_HERK, HERK, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans, clblasConjTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_HERK, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,10,0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig0_HERK, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(2510, 4300), Values(1500,4600), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); #endif // Correctness #endif // DO_HERK #ifdef DO_HER2K #if !defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(SPL_HER2K, HER2K, Combine( Values(clblasColumnMajor, clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(513), Values(513), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), Values(1))); #endif #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HER2K, HER2K, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_HER2K, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasConjTrans), Values(14), Values(15), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HER2K, HER2K, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1))); 
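/*
 * Illustrative sketch, not taken from the project's reference BLAS: the HERK
 * cases above verify C := alpha * A * A^H + beta * C with real alpha and beta,
 * touching only the selected triangle and keeping the diagonal of C real
 * (HER2K adds the corresponding A * B^H / B * A^H pair). naiveHerkExample is a
 * hypothetical, unoptimized restatement of that definition for the
 * column-major, non-transposed case, included only to show what the
 * correctness comparison measures.
 */
#include <complex>
#include <cstddef>

static void
naiveHerkExample(size_t N, size_t K, float alpha,
                 const std::complex<float> *A, size_t lda,
                 float beta, std::complex<float> *C, size_t ldc,
                 bool upper)
{
    for (size_t j = 0; j < N; ++j) {
        size_t iStart = upper ? 0 : j;
        size_t iEnd   = upper ? j + 1 : N;
        for (size_t i = iStart; i < iEnd; ++i) {
            std::complex<float> acc(0.0f, 0.0f);
            for (size_t k = 0; k < K; ++k) {
                // column-major: A(i,k) lives at A[i + k * lda]
                acc += A[i + k * lda] * std::conj(A[j + k * lda]);
            }
            std::complex<float> c = alpha * acc + beta * C[i + j * ldc];
            // Hermitian result: force the diagonal to stay real
            C[i + j * ldc] = (i == j) ? std::complex<float>(c.real(), 0.0f) : c;
        }
    }
}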
INSTANTIATE_TEST_CASE_P(Uplo_HER2K, HER2K, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_HER2K, HER2K, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans, clblasConjTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,0,10,0)), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_HER2K, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,10,0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig0_HER2K, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(2510, 4300), Values(1500,4600), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes(0,0,0,9,0,0)), Values(1))); #endif // Correctness #endif // DO_HER2K #ifdef DO_SYRK // xSYRK tests INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange, SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes()), Values(1))); // We know, that SmallRange does not have values more that 257, // so lda is set to 500. INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDA, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 0, 501, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDA, SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(500, 0, 501, 0, 0, 0)), Values(1))); // cases for the extended versions with the offsets #if defined(SHORT_TESTS) || defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(252), Values(353), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 702)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(252), Values(353), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 702)), Values(1))); #else /* SHORT_TESTS || MEDIUM_TESTS */ INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_0, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(255), Values(253), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_1, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(128), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 501)), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeEx_2, SYRK, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(75), Values(200), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 501)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_0, 
SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(255), Values(253), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_1, SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(128), Values(64), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 501)), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeEx_2, SYRK, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), Values(75), Values(200), Values(clMath::ExtraTestSizes(0, 0, 0, 500, 0, 501)), Values(1))); #endif /* !SHORT_TESTS && !MEDIUM_TESTS */ // Big matrices #if !defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(2801), Values(2903), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedBig_1, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(4567), Values(4321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(5567), Values(5321), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3, SYRK, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(6567), Values(3321), Values(clMath::ExtraTestSizes()), Values(1))); #endif // !MEDIUM_TESTS #endif // !SHORT_TESTS // Small matrices and Custom tests INSTANTIATE_TEST_CASE_P(SelectedSmall_0, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(1), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined SHORT_TESTS INSTANTIATE_TEST_CASE_P(SelectedSmall_1, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasTrans), Values(2), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); #if !defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(SelectedSmall_2, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(3), Values(2), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_3, SYRK, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(4), Values(3), Values(clMath::ExtraTestSizes()), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall_4, SYRK, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(17), Values(1), Values(clMath::ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYRK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(32), Values(32), Values(clMath::ExtraTestSizes()), Values(1))); #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ #endif // DO_SYRK #ifdef DO_TRMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(ShortTRMV, TRMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0), Values(0), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_TRMV, TRMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0,9), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_TRMV, TRMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0), 
Values(0,10), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_TRMV, TRMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(transSet), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0,9), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(Diag_TRMV, TRMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange),Values(0), Values(1), Values(0), Values(0,10), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(All_TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),Values(0,4097), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); #endif // Correctness #endif #ifdef DO_TPMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(ShortTPMV, TPMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0), Values(0), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_TPMV, TPMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0,9), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_TPMV, TPMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0), Values(0,10), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_TPMV, TPMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(transSet), Values(clblasUnit),ValuesIn(smallRange),Values(0), Values(1), Values(0,9), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(Diag_TPMV, TPMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange),Values(0), Values(1), Values(0), Values(0,10), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(All_TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange),Values(0,4097), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); #endif // Correctness #endif #ifdef DO_TRSV #ifdef SHORT_TESTS INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTRSV, TRSV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange), Values(0), Values(1), Values(0), Values(0), Values(1))); #endif #ifdef MEDIUM_TESTS INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTRSV, TRSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasTrans), Values(clblasNonUnit), ValuesIn(smallRange), Values(0), Values(1), Values(0), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTRSV, TRSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(clblasUnit), ValuesIn(smallRange), Values(0), ValuesIn(incs), Values(0), Values(0), Values(1))); #endif #if !defined SHORT_TESTS && !defined MEDIUM_TESTS INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTRSV, TRSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTRSV, TRSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRange_BigLDATRSV, TRSV, Combine( Values(clblasColumnMajor), 
ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(500), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRange_BigLDATRSV, TRSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(500), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTRSV, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); #endif #endif #ifdef DO_TPSV #ifdef SHORT_TESTS INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTPSV, TPSV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasUnit),ValuesIn(smallRange), Values(0), Values(1), Values(0), Values(0), Values(1))); #endif #ifdef MEDIUM_TESTS INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTPSV, TPSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasTrans), Values(clblasNonUnit), ValuesIn(smallRange), Values(0), Values(1), Values(0), Values(0), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTPSV, TPSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(clblasUnit), ValuesIn(smallRange), Values(0), ValuesIn(incs), Values(0), Values(0), Values(1))); #endif #if !defined SHORT_TESTS && !defined MEDIUM_TESTS INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeTPSV, TPSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeTPSV, TPSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SmallRange_VariousIncTPSV, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),ValuesIn(smallRange), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); #endif #endif /*#ifdef DO_SYMM order = ::std::tr1::get<0>(GetParam()); side = ::std::tr1::get<1>(GetParam()); uplo = ::std::tr1::get<2>(GetParam()); M = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); lda = ::std::tr1::get<5>(GetParam()); ldb = ::std::tr1::get<6>(GetParam()); ldc = ::std::tr1::get<7>(GetParam()); offa = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeSYMM, SYMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(3192), Values(3192), Values(3192), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeSYMM, SYMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(3192), Values(3192), Values(3192), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDASYMM, SYMM, Combine( Values(clblasColumnMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(ldaRange), ValuesIn(ldaRange), ValuesIn(ldaRange), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDASYMM, SYMM, Combine( Values(clblasRowMajor), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(ldaRange), ValuesIn(ldaRange), 
ValuesIn(ldaRange), Values(0), Values(1) ) ); #endif */ #ifdef DO_SYR /* clblasOrder, // order clblasUplo, // uplo int, // N double, //alpha int, // offx int, // incx, should be greater than 0 int, // offa int, // lda, 0 - undefined int // numCommandQueues */ #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SYR, SYR, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_SYR, SYR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_SYR, SYR, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_SYR, SYR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_SYR, SYR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL, SYR, Combine(ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif #endif #ifdef DO_SPR #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SPR, SPR, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_SPR, SPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_SPR, SPR, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_SPR, SPR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_SPR, SPR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(All_SPR, SPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_GER #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_GER, GER, Combine( Values(clblasRowMajor),ValuesIn(smallRange), ValuesIn(smallRange), Values(0), Values(1), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_GER, GER, Combine( ValuesIn(orderSet), Values(61), Values(32), Values(0), Values(4,-11), Values(-30,1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_GER, GER, Combine( ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(0), Values(-10), Values(21), Values(0,9), Values(0), 
Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_GER, GER, Combine( ValuesIn(orderSet), Values(4900), Values(3999), Values(0), Values(4), Values(-33), Values(0), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_GER, GER, Combine( ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_GERC #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_GERC, GERC, Combine( Values(clblasRowMajor),ValuesIn(smallRange), ValuesIn(smallRange), Values(0), Values(1), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_GERC, GERC, Combine( ValuesIn(orderSet), Values(61), Values(32), Values(0), Values(4,-11), Values(-30,1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_GERC, GERC, Combine( ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(0), Values(-10), Values(21), Values(0,9), Values(0), Values(0,19), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_GERC, GERC, Combine( ValuesIn(orderSet), Values(4900), Values(3999), Values(0), Values(4), Values(-33), Values(0), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_GERC, GERC, Combine( ValuesIn(orderSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_HER #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HER, HER, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HER, HER, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_HER, HER, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeHER, HER, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER, HER, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDAHER, HER, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDAHER, HER, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) 
); #endif #endif #ifdef DO_HPR #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HPR, HPR, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_HPR, HPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HPR, HPR, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_HPR, HPR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_HPR, HPR, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(All_HPR, HPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_HER2 #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HER2, HER2, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HER2, HER2, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_HER2, HER2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ColumnMajor_SmallRangeHER2, HER2, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_SmallRangeHER2, HER2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(ColumnMajor_VariousLDAHER2, HER2, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(RowMajor_VariousLDAHER2, HER2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange),ValuesIn(ldaRange), Values(1) ) ); #endif #endif #ifdef DO_HPR2 #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HPR2, HPR2, Combine( 
Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_HPR2, HPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HPR2, HPR2, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_HPR2, HPR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_HPR2, HPR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(All_HPR2, HPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif // Correctness #endif /*INSTANTIATE_TEST_CASE_P(ALL_HEMM_WITH_OFFSETS_ZERO, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), //Values(clMath::ExtraTestSizes(0, 0, 0, 12, 0, 1)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(ALL_HEMM, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 12, 13, 15)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_0, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), Values(5600), Values(5600),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) ); */ /* INSTANTIATE_TEST_CASE_P(SYMM_VERYSMALL, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(verySmallRange), ValuesIn(verySmallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) );*/ /*INSTANTIATE_TEST_CASE_P(ALL_SYMM, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 1, 3, 13)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(ALL_SYMM_WITH_OFFSETS_ZERO, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_0, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), Values(5600), Values(5600),ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) ); */ #ifdef DO_SYR2 #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SYR2, SYR2, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), 
ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_SYR2, SYR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_SYR2, SYR2, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_SYR2, SYR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_SYR2, SYR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 2800), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL, SYR2, Combine(ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1))); #endif #endif #ifdef DO_SPR2 #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SPR2, SPR2, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall_SPR2, SPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(15), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_SPR2, SPR2, Combine( ValuesIn(orderSet), Values(clblasLower), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,9), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Uplo_SPR2, SPR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), Values(0,10), Values(1), Values(0,10), Values(0,9), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_SPR2, SPR2, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(1500, 5101), ValuesIn(realAlphaRange), Values(0), Values(1), Values(0), Values(0), Values(0), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(All_SPR2, SPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_GBMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_GBMV, GBMV, Combine( Values(clblasRowMajor), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_GBMV, GBMV, Combine( ValuesIn(orderSet), Values(clblasConjTrans), Values(14), Values(15), Values(10), Values(8),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_GBMV, GBMV, Combine( ValuesIn(orderSet), Values(clblasNoTrans), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), 
Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_GBMV, GBMV, Combine( Values(clblasRowMajor), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_GBMV, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(1, 2, 4, 9), Values(3, 6, 11), Values(5), Values(7),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig0_GBMV, GBMV, Combine( Values(clblasRowMajor), ValuesIn(transSet), Values(2599), Values(999), Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_GBMV, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange), ValuesIn(smallRange),Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig1_GBMV, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(2510, 2300), Values(1500,2400), Values(2509, 2299), Values(1499,2399),Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif // Correctness #endif // DO_GBMV #ifdef DO_SBMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SBMV, SBMV, Combine( Values(clblasRowMajor), Values(clblasUpper), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_SBMV, SBMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(14), Values(10), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_SBMV, SBMV, Combine( ValuesIn(orderSet), Values(clblasUpper), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo__SBMV, SBMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_SBMV, SBMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(7), Values(5),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig0_SBMV, SBMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_SBMV, SBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig1_SBMV, SBMV, Combine( 
ValuesIn(orderSet), ValuesIn(uploSet), Values(2510, 2300), Values(1500,1700), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif // Correctness #endif // DO_SBMV //HBMV #ifdef DO_HBMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_HBMV, HBMV, Combine( Values(clblasRowMajor), Values(clblasUpper), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_HBMV, HBMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(14), Values(10), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_HBMV, HBMV, Combine( ValuesIn(orderSet), Values(clblasUpper), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_HBMV, HBMV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_HBMV, HBMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(7), Values(5),Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig0_HBMV, HBMV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(2000), Values(565), Values(clMath::ExtraTestSizes(0,(int)30,(int)1,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_HBMV, HBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig1_HBMV, HBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(2510, 2300), Values(1500,1700), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif // Correctness #endif // DO_HBMV #ifdef DO_TBMV #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_TBMV, TBMV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBMV, TBMV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(14), Values(13), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_TBMV, TBMV, Combine( ValuesIn(orderSet), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_TBMV, TBMV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_TBMV, TBMV, Combine( Values(clblasRowMajor), Values(clblasLower), 
ValuesIn(transSet), Values(clblasUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Diag_TBMV, TBMV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBMV, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1))); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_TBMV, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_TBMV, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(2509, 2299), Values(1499,2199), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), Values(1))); #endif // Correctness #endif // DO_TBMV #ifdef DO_TBSV #if defined(SHORT_TESTS) /* INSTANTIATE_TEST_CASE_P(Short_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBSV, TBSV, Combine( ValuesIn(orderSet), Values(clblasLower), Values(clblasTrans), Values(clblasUnit), Values(14), Values(13), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1))); */ INSTANTIATE_TEST_CASE_P(Short_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedSmall0_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit), Values(14), Values(13), Values(ExtraTestSizes(0, (int)-2, (int)1, 9, 0, 0)), Values(1))); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Order_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), Values(clblasUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Diag_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), Values(clblasNoTrans), ValuesIn(diagSet), Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1))); /* INSTANTIATE_TEST_CASE_P(Order_TBSV, TBSV, Combine( ValuesIn(orderSet), 
Values(clblasUpper), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange),Values(ExtraTestSizes(0, (int)1, (int)33, 10, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Uplo_TBSV, TBSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 10)), Values(1))); INSTANTIATE_TEST_CASE_P(Trans_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasLower), ValuesIn(transSet), Values(clblasUnit), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)-33, (int)1, 0, 10, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Diag_TBSV, TBSV, Combine( Values(clblasRowMajor), Values(clblasUpper), Values(clblasNoTrans), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(ExtraTestSizes(0, (int)1, (int)1, 8, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedVerySmall_TBSV, TBSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(1, 2, 4, 9), Values(3), Values(ExtraTestSizes(0, (int)-1, (int)1, 9, 0, 0)), Values(1))); */ #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_TBSV, TBSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(smallRange), ValuesIn(smallRange), Values(clMath::ExtraTestSizes(0,(int)22,(int)-20,9,10,0)), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_TBSV, TBSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(2509, 2299), Values(1499,2199), Values(clMath::ExtraTestSizes(0,(int)3,(int)-2,9,0,6)), Values(1))); #endif // Correctness #endif // DO_TBSV //COPY #ifdef DO_COPY #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_COPY, COPY, Combine( ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_COPY, COPY, Combine( Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_COPY, COPY, Combine( ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_COPY, COPY, Combine( Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_COPY, COPY, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif //DOT #ifdef DO_DOT #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_DOT, DOT, Combine( ValuesIn(smallRange), Values(1), Values(1), Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_DOT, DOT, Combine( Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) , Values(1)) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_DOT, DOT, Combine( ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_DOT, DOT, Combine( Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_DOT, DOT, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_DOTC #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_DOTC, DOTC, Combine( ValuesIn(smallRange), Values(1), Values(1), 
Values(1), Values(1), Values(1), Values(1)) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_DOTC, DOTC, Combine( Values(61), Values(4, -11), Values(1), Values(0), Values(1), Values(1) , Values(1)) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_DOTC, DOTC, Combine( ValuesIn(smallRange), Values(-10), Values(1), Values(1), Values(1), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_DOTC, DOTC, Combine( Values(4900), Values(1), Values(1), Values(4), Values(1), Values(1), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_DOTC, DOTC, Combine( ValuesIn(completeRange), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif // Correctness #endif #ifdef DO_SCAL #if defined(SHORT_TESTS) INSTANTIATE_TEST_CASE_P(Short_SCAL, SCAL, Combine( ValuesIn(smallRange), ValuesIn(alphaBetaRange), Values(0), Values(1), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedSmall0_SCAL, SCAL, Combine( Values(61), ValuesIn(alphaBetaRange), Values(0), Values(4,-11), Values(1) ) ); #elif defined(MEDIUM_TESTS) INSTANTIATE_TEST_CASE_P(Medium_SCAL, SCAL, Combine( ValuesIn(smallRange), ValuesIn(alphaBetaRange), Values(0), Values(-10), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig0_SCAL, SCAL, Combine( Values(4900), ValuesIn(alphaBetaRange), Values(0), Values(4), Values(1) ) ); #else // Correctness INSTANTIATE_TEST_CASE_P(ALL_SCAL, SCAL, Combine( ValuesIn(completeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), Values(1) ) ); #endif // Correctness #endif // Big matrices #if !defined SHORT_TESTS #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(SelectedBig_0TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(2800), Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_1TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(4567), Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(SelectedBig_0TRSV, TRSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(2800), Values(0), Values(1), Values(0), Values(0), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(SelectedBig_0TPSV, TPSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(2800), Values(0), Values(1), Values(0), Values(0), Values(1))); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(SelectedBig_0HER, HER, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(2800), Values((double)50), Values(0), Values(1), Values(0), Values(0), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(SelectedBig_0HER2, HER2, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(2800), Values((cl_float2)floatComplex(0,1)), Values(0), Values(1), Values(0), Values(0), Values(0),Values(1) ) ); #endif #if !defined(MEDIUM_TESTS) #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(SelectedBig_2TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(5567), Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(6567), Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_4TRMV, TRMV, Combine( 
ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(7567), Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TPMV INSTANTIATE_TEST_CASE_P(SelectedBig_2TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(5567),Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(6567),Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_4TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(7567),Values(0), ValuesIn(incs), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(SelectedBig_1TRSV, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(4567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2TRSV, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(5567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3TRSV, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(6567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_4TRSV, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(7567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(SelectedBig_1TPSV, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(4567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_2TPSV, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(5567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_3TPSV, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(6567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); INSTANTIATE_TEST_CASE_P(SelectedBig_4TPSV, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(7567), Values(0), ValuesIn(incs), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(SelectedBig_1HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_2HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(2048), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_3HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_4HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(2055), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incs), 
ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(SelectedBig_1HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange),Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_2HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(2048), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange),Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_3HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(3192), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange),Values(1) ) ); INSTANTIATE_TEST_CASE_P(SelectedBig_4HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(2055), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange),ValuesIn(offsetRange), ValuesIn(ldaRange),Values(1) ) ); #endif #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ // Small matrices #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(SelectedSmall_0TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(1), Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TPMV INSTANTIATE_TEST_CASE_P(SelectedSmall_0TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(1),Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(SelectedSmall_0TRSV, TRSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit), Values(1), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(SelectedSmall_0TPSV, TPSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit), Values(1), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(SelectedSmall_0HER, HER, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(4), ValuesIn(realAlphaRange), Values(0), ValuesIn(incs), Values(0,9), Values(0,11), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(SelectedSmall_0HER2, HER2, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(4), ValuesIn(complexAlphaRange), Values(0,7), ValuesIn(incs), Values(0,9), Values(0,11), Values(0),Values(1) ) ); #endif #if !defined SHORT_TESTS #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(SelectedSmall_1TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(2), Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TPMV INSTANTIATE_TEST_CASE_P(SelectedSmall_1TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(2),Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(SelectedSmall_1TRSV, TRSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasUnit), Values(2), Values(0), Values(1), Values(10), Values(9), Values(1))); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(SelectedSmall_1HER, HER, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(12), ValuesIn(realAlphaRange), Values(0), 
ValuesIn(incs), Values(0), Values(1), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(SelectedSmall_1HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(12), ValuesIn(complexAlphaRange), Values(0,1), ValuesIn(incs), Values(0),Values(9), Values(0),Values(1) ) ); #endif #if !defined(MEDIUM_TESTS) #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(SelectedSmall_2TRMV, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(13), Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TPMV INSTANTIATE_TEST_CASE_P(SelectedSmall_2TPMV, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans), ValuesIn(diagSet), Values(13),Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(SelectedSmall_2TRSV, TRSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasNoTrans), Values(clblasNonUnit), Values(13), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(SelectedSmall_2TPSV, TPSV, Combine( Values(clblasRowMajor), ValuesIn(uploSet), Values(clblasTrans), Values(clblasUnit), Values(13), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(SelectedSmallHER_2HER, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(65), ValuesIn(realAlphaRange), Values(0), ValuesIn(incs), Values(0), Values(0), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(SelectedSmallHER2_2HER2, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(65), ValuesIn(complexAlphaRange), Values(0), ValuesIn(incs), Values(0), Values(0), Values(0), Values(1) ) ); #endif #endif /* !MEDIUM_TESTS */ #endif /* !SHORT_TESTS */ // Custom test - use command line arguments to tweak it #if !defined SHORT_TESTS && !defined MEDIUM_TESTS #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(Custom, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(0), Values(1), Values(0, 10), Values(0, 9), Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(Custom, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(Custom, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_GER INSTANTIATE_TEST_CASE_P(Custom, GER, Combine( ValuesIn(orderSet), Values(32), Values(32), Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10), Values(1) ) ); #endif #ifdef DO_GERC INSTANTIATE_TEST_CASE_P(Custom, GERC, Combine( ValuesIn(orderSet), Values(32), Values(32), Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10), Values(1) ) ); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(Custom, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(99.0), Values(0), Values(1), Values(6, 2), Values(0, 5), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(Custom, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0), Values(0),Values(40), Values(1) ) ); #endif #endif /* !SHORT_TESTS */ // Multiple command queues tests #if defined SHORT_TESTS #define QUEUES_TEST_MATRIX_SIZES 257 #elif defined MEDIUM_TESTS #define 
QUEUES_TEST_MATRIX_SIZES 385 #else #define QUEUES_TEST_MATRIX_SIZES 513,1025 #endif #if !defined(SHORT_TESTS) #ifdef DO_GEMM INSTANTIATE_TEST_CASE_P(MultipleQueues, GEMM, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #if !defined(MEDIUM_TESTS) #ifdef DO_TRMM INSTANTIATE_TEST_CASE_P(MultipleQueues, TRMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #ifdef DO_TRSM INSTANTIATE_TEST_CASE_P(MultipleQueues, TRSM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #endif /* MEDIUM_TESTS */ #ifdef DO_GEMV INSTANTIATE_TEST_CASE_P(MultipleQueues, GEMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues))); #endif #ifdef DO_SYMV INSTANTIATE_TEST_CASE_P(MultipleQueues, SYMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues))); #endif #ifdef DO_SYR2K INSTANTIATE_TEST_CASE_P(MultipleQueues, SYR2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #ifdef DO_SYRK INSTANTIATE_TEST_CASE_P(MultipleQueues, SYRK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #if !defined MEDIUM_TESTS #ifdef DO_HERK INSTANTIATE_TEST_CASE_P(MultipleQueues, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #ifdef DO_HER2K INSTANTIATE_TEST_CASE_P(MultipleQueues, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(clMath::ExtraTestSizes()), ValuesIn(numQueues))); #endif #ifdef DO_TRMV INSTANTIATE_TEST_CASE_P(MultipleQueues, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(0, 10), Values(0, 9), ValuesIn(numQueues))); #endif #ifdef DO_TPMV INSTANTIATE_TEST_CASE_P(MultipleQueues, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(0, 10), Values(0, 9), ValuesIn(numQueues))); #endif #ifdef DO_HEMV INSTANTIATE_TEST_CASE_P(MultipleQueues, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues))); 
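// Note: QUEUES_TEST_MATRIX_SIZES above expands to a single size (257 for
// SHORT_TESTS, 385 for MEDIUM_TESTS) or to the pair 513,1025 otherwise, so each
// MultipleQueues instantiation covers one or two problem sizes per Combine().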
#endif #ifdef DO_HPMV INSTANTIATE_TEST_CASE_P(MultipleQueues, HPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues))); #endif #ifdef DO_SPMV INSTANTIATE_TEST_CASE_P(MultipleQueues, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(0, 10), Values(0, 9), Values(0, 8), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), ValuesIn(numQueues))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(MultipleQueues, TRSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasConjTrans), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(0,10), Values(0,9), ValuesIn(numQueues))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(MultipleQueues, TPSV, Combine( Values(clblasColumnMajor), ValuesIn(uploSet), Values(clblasTrans), ValuesIn(diagSet),Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(0,10), Values(0,9), ValuesIn(numQueues))); #endif #ifdef DO_SYR INSTANTIATE_TEST_CASE_P(MultipleQueues, SYR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_SPR INSTANTIATE_TEST_CASE_P(MultipleQueues, SPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_GER INSTANTIATE_TEST_CASE_P(MultipleQueues, GER, Combine( ValuesIn(orderSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10), ValuesIn(numQueues) ) ); #endif #ifdef DO_GERC INSTANTIATE_TEST_CASE_P(MultipleQueues, GERC, Combine( ValuesIn(orderSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(0), Values(1), Values(1), Values(0, 9), Values(0, 11), Values(0, 10), ValuesIn(numQueues) ) ); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(MultipleQueues, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange), ValuesIn(ldaRange), Values(1), Values(0), Values(0), ValuesIn(numQueues) ) ); #endif #ifdef DO_HPR INSTANTIATE_TEST_CASE_P(MultipleQueues, HPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange), ValuesIn(ldaRange), Values(1), Values(0), Values(0), ValuesIn(numQueues) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(MultipleQueues, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0),Values(1), ValuesIn(ldaRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_HPR2 INSTANTIATE_TEST_CASE_P(MultipleQueues, HPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(complexAlphaRange), Values(0), Values(1), Values(0),Values(1), ValuesIn(ldaRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_SYR2 #endif #ifdef DO_SPR2 INSTANTIATE_TEST_CASE_P(MultipleQueues, SPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incs), 
ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_GBMV INSTANTIATE_TEST_CASE_P(MultipleQueues, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES),Values(clMath::ExtraTestSizes(0,(int)1,(int)1,0,0,0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(numQueues))); #endif #ifdef DO_TBMV INSTANTIATE_TEST_CASE_P(MultipleQueues, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(QUEUES_TEST_MATRIX_SIZES), Values(QUEUES_TEST_MATRIX_SIZES), Values(clMath::ExtraTestSizes(0,(int)1,(int)1,0,0,0)), ValuesIn(numQueues))); #endif #ifdef DO_SCAL INSTANTIATE_TEST_CASE_P(MultipleQueues, SCAL, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues))); #endif #ifdef DO_COPY INSTANTIATE_TEST_CASE_P(MultipleQueues, COPY, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues))); #endif #ifdef DO_SWAP INSTANTIATE_TEST_CASE_P(MultipleQueues, SWAPXY, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues) ) ); #endif #ifdef DO_DOT INSTANTIATE_TEST_CASE_P(MultipleQueues, DOT, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_DOTC INSTANTIATE_TEST_CASE_P(MultipleQueues, DOTC, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_AXPY INSTANTIATE_TEST_CASE_P(MultipleQueues, AXPY, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(numQueues))); #endif #ifdef DO_ROTG INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues))); #endif #ifdef DO_ROTM INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTM, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(sflagRange), ValuesIn(numQueues))); #endif #ifdef DO_ROT INSTANTIATE_TEST_CASE_P(MultipleQueues, ROT, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), ValuesIn(numQueues))); #endif #ifdef DO_ROTMG INSTANTIATE_TEST_CASE_P(MultipleQueues, ROTMG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(sflagRange), ValuesIn(numQueues))); #endif #ifdef DO_NRM2 INSTANTIATE_TEST_CASE_P(MultipleQueues, NRM2, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_ASUM INSTANTIATE_TEST_CASE_P(MultipleQueues, ASUM, Combine( Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) ); #endif #ifdef DO_iAMAX INSTANTIATE_TEST_CASE_P(MultipleQueues, iAMAX, Combine( 
Values(QUEUES_TEST_MATRIX_SIZES), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(numQueues) ) ); #endif #endif /* !MEDIUM_TESTS */ #endif /* SHORT_TESTS */ #undef QUEUES_TEST_MATRIX_SIZES /////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { ::clMath::BlasBase *base; TestParams params; int ret; if( (argc > 1) && ( !strcmp(argv[1], "--test-help") || !strcmp(argv[1], "-?") || !strcmp(argv[1], "-h") ) ) { printUsage("test-correctness"); ::testing::InitGoogleTest(&argc, argv); return 0; } // The library takes an environment variable to control how to cache kernels; automate the setting of this // environment variable in our different test programs to set it to reasonable values // Read environmental variable to limit or disable ( 0 ) the size of the kernel cache in memory char* kCacheEnv = getenv( "AMD_CLBLAS_KCACHE_LIMIT_MB" ); if( kCacheEnv == NULL ) { #if defined( SHORT_TESTS ) #else putenv( (char*)"AMD_CLBLAS_KCACHE_LIMIT_MB=256" ); #endif } ::testing::InitGoogleTest(&argc, argv); ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl; base = ::clMath::BlasBase::getInstance(); if (base == NULL) { ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! " "Leaving the test." << ::std::endl; return -1; } base->setSeed(DEFAULT_SEED); if (argc != 1) { params.optFlags = NO_FLAGS; params.devType = CL_DEVICE_TYPE_GPU; params.devName = NULL; if (parseBlasCmdLineArgs(argc, argv, ¶ms) != 0) { printUsage(argv[0]); return 1; } if (params.optFlags & SET_SEED) { base->setSeed(params.seed); } if (params.optFlags & SET_ALPHA) { base->setAlpha(params.alpha); } if (params.optFlags & SET_BETA) { base->setBeta(params.beta); } if (params.optFlags & SET_M) { base->setM(params.M); } if (params.optFlags & SET_N) { base->setN(params.N); } if (params.optFlags & SET_K) { base->setK(params.K); } if (params.optFlags & SET_INCX) { base->setIncX(params.incx); } if (params.optFlags & SET_INCY) { base->setIncY(params.incy); } if (params.optFlags & SET_DEVICE_TYPE) { if (!base->setDeviceType(¶ms.devType, params.devName)) { ::std::cerr << "Fatal error, OpenCL or clblas " "initialization failed! Leaving the test." << ::std::endl; return -1; } } if (params.optFlags & SET_NUM_COMMAND_QUEUES) { base->setNumCommandQueues(params.numCommandQueues); } } parseEnv(¶ms); if (params.optFlags & SET_USE_IMAGES) { base->setUseImages(params.useImages); } /* Use of image based buffers is deprecated if (base->useImages()) { if (base->addScratchImages()) { std::cerr << "FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" << std::endl; } } */ base->printEnvInfo(); ret = RUN_ALL_TESTS(); if (base->useImages()) { base->removeScratchImages(); } /* * Explicitely tell the singleton to release all resources, * before we return from main. */ base->release( ); return ret; } clblas-2.10/src/tests/correctness/trsm-delta.h000066400000000000000000000177771264277366700214530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include // Type-dependant constants template static cl_double DELTA_0(); template<> __template_static cl_double DELTA_0() { return pow(2.0, -20); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -50); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -20); } template<> __template_static cl_double DELTA_0() { return pow(2.0, -50); } size_t trsmBlockSize(size_t elemSize) { /* TODO: Right now TRSM generators use block size of 16 elements for the * double complex type, and of 32 elements for another types. * If this changes, we have to fetch block size from TRSM generator * somehow. */ return (elemSize == sizeof(DoubleComplex)) ? 16 : 32; } template void trsmDelta( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, T *A, size_t lda, T *B, size_t ldb, T alpha, cl_double *delta) { cl_double *deltaCLBLAS, s; int i, k, j, jStart, jEnd, idx; int zinc; size_t z = 0; size_t bsize; bool isUpper; T v; isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) || ((uplo == clblasLower) && (transA != clblasNoTrans)); deltaCLBLAS = new cl_double[M * N]; bsize = trsmBlockSize(sizeof(T)); if (side == clblasLeft) { // Calculate delta of TRSM evaluated with the Gauss' method for (k = 0; k < (int)N; k++) { if (isUpper) { for (i = (int)M - 1; i >= 0; i--) { v = getElement(order, clblasNoTrans, i, k, B, ldb); if (diag == clblasNonUnit) { v = v / getElement(order, transA, i, i, A, lda); } s = module(v) * DELTA_0() * module(alpha); if (i == (int)(M - 1)) { delta[i * N + k] = s; } else { delta[i * N + k] = s + delta[(i + 1) * N + k]; } assert(delta[i* N + k] >= 0); } } else { for (i = 0; i < (int)M; i++) { v = getElement(order, clblasNoTrans, i, k, B, ldb); if (diag == clblasNonUnit) { v = v / getElement(order, transA, i, i, A, lda); } s = module(v) * DELTA_0() * module(alpha); if (i == 0) { delta[i * N + k] = s; } else { delta[i * N + k] = s + delta[(i - 1) * N + k]; } assert(delta[i* N + k] >= 0); } } } // Calculate clblas TRSM delta for (k = 0; k < (int)N; k++) { for (i = 0; i < (int)M; i++) { s = 0.0; /* * For the upper triangular matrix the solving process proceeds * from the bottom to the top, and the bottommost block's * delta influents most of all. For the lower triangular matrix * the situation is opposite. 
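 * (Concretely, in the loop below each contributing row idx = j*bsize + i%bsize
 * adds its Gauss-method delta weighted by z; z grows toward the end of the
 * matrix where the solve starts, and the accumulated sum is finally scaled by
 * the generator block size bsize.)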
*/ if (isUpper) { jStart = i / (int)bsize; // index of the block just after the last matrix block jEnd = ((int)M + (int)bsize - 1) / (int)bsize; z = 1; zinc = 1; } else { jStart = 0; jEnd = i / (int)bsize + 1; z = jEnd - jStart; zinc = -1; } for (j = jStart; j < jEnd; j++) { idx = j * (int)bsize + i % (int)bsize; if (idx >= (int)M) { continue; } s += z * delta[idx * N + k]; z += zinc; } deltaCLBLAS[i * N + k] = s * bsize; assert(deltaCLBLAS[i* N + k] >= 0); } } } else { // Calculate delta of TRSM evaluated with the Gauss' method for (i = 0; i < (int)M; i++) { if (isUpper) { for (k = 0; k < (int)N; k++) { v = getElement(order, clblasNoTrans, i, k, B, ldb); if (diag == clblasNonUnit) { v = v / getElement(order, transA, k, k, A, lda); } s = module(v) * DELTA_0() * module(alpha); if (k == 0) { delta[i * N + k] = s; } else { delta[i * N + k] = s + delta[i * N + (k - 1)]; } assert(delta[i* N + k] >= 0); } } else { for (k = (int)N - 1; k >= 0; k--) { v = getElement(order, clblasNoTrans, i, k, B, ldb); if (diag == clblasNonUnit) { v = v / getElement(order, transA, k, k, A, lda); } s = module(v) * DELTA_0() * module(alpha); if (k == (int)(N - 1)) { delta[i * N + k] = s; } else { delta[i * N + k] = s + delta[i * N + (k + 1)]; } assert(delta[i* N + k] >= 0); } } } // Calculate clblas TRSM delta for (i = 0; i < (int)M; i++) { for (k = 0; k < (int)N; k++) { s = 0.0; /* * Approach is the same as for the left side matrix, but delta * is calculated over the rows rather than the columns. * Now, since the matrices are swapped, the largest and * tightest blocks are swapped as well. Therefore, pass * direction for the upper and lower triangular matrix is also * swapped. */ if (isUpper) { jStart = 0; jEnd = k / (int)bsize + 1; z = jEnd - jStart; zinc = -1; } else { jStart = k / (int)bsize; jEnd = (k + (int)bsize - 1) / (int)bsize; z = 1; zinc = 1; } for (j = jStart; j < jEnd; j++) { idx = j * (int)bsize + k % (int)bsize; if (idx >= (int)N) { continue; } s += z * delta[i * N + idx]; z += zinc; } deltaCLBLAS[i * N + k] = s * bsize; assert(deltaCLBLAS[i* N + k] >= 0); } } } for (k = 0; k < (int)N; k++) { for (i = 0; i < (int)M; i++) { delta[i * N + k] += deltaCLBLAS[i * N + k]; } } delete[] deltaCLBLAS; } clblas-2.10/src/tests/correctness/trsv-delta.h000066400000000000000000000226031264277366700214440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TRSV_DELTA_H_ #define TRSV_DELTA_H_ #include "delta.h" static size_t trsvBlockSize(size_t elemSize) { /* TODO: Right now TRSV generators use block size of 16 elements for the * double complex type, and of 32 elements for another types. * If this changes, we have to fetch block size from TRSV generator * somehow. */ return (elemSize == sizeof(DoubleComplex)) ? 
16 : 32; } template void trsvDelta( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, T *A, size_t lda, T *X, int incx, cl_double *delta) { cl_double *deltaCLBLAS, s; int i, j, jStart, jEnd, idx; int zinc; size_t z = 0; size_t bsize, lenX; bool isUpper = false; size_t previncxi=0; T v; isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) || ((uplo == clblasLower) && (transA != clblasNoTrans)); // incx = abs(incx); lenX = 1 + (N-1)*abs(incx); deltaCLBLAS = new cl_double[lenX]; bsize = trsvBlockSize(sizeof(T)); // Calculate delta of TRSV evaluated with the Gauss' method if (isUpper) { for (i = (int)N - 1; i >= 0; i--) { size_t incxi; incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); v = getElement(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX); if (diag == clblasNonUnit) { T tempA; if(lda > 0) { tempA = getElement(order, transA, i, i, A, lda); } else { tempA = getElementPacked(order, clblasNoTrans, uplo, i, i, A, N); } v = v / tempA; } s = module(v) * DELTA_0(); if (i == (int)(N - 1)) { delta[ incxi ] = s; } else { delta[ incxi ] = s + delta[ previncxi ]; } assert(delta[ incxi ] >= 0); previncxi = incxi; } } else { for (i = 0; i < (int)N; i++) { size_t incxi; incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); v = getElement(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX); if (diag == clblasNonUnit) { T tempA; if(lda > 0) { tempA = getElement(order, transA, i, i, A, lda); } else { tempA = getElementPacked(order, clblasNoTrans, uplo, i, i, A, N); } v = v / tempA; } s = module(v) * DELTA_0(); if (i == 0) { delta[ incxi ] = s; } else { delta[ incxi ] = s + delta[ previncxi ]; } assert(delta[ incxi ] >= 0); previncxi = incxi; } } // Calculate clblas TRSV delta for (i = 0; i < (int)N; i++) { size_t incxi; s = 0.0; /* * For the upper triangular matrix the solving process proceeds * from the bottom to the top, and the bottommost block's * delta influents most of all. For the lower triangular matrix * the situation is opposite. */ if (isUpper) { jStart = i / (int)bsize; // index of the block just after the last matrix block jEnd = ((int)N + (int)bsize - 1) / (int)bsize; z = 1; zinc = 1; } else { jStart = 0; jEnd = i / (int)bsize + 1; z = jEnd - jStart; zinc = -1; } for (j = jStart; j < jEnd; j++) { size_t incxi; idx = j * (int)bsize + i % (int)bsize; if (idx >= (int)N) { continue; } incxi = (incx > 0) ? (idx*incx) : (N-1-idx)*abs(incx); s += z * delta[ incxi ]; z += zinc; } incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); deltaCLBLAS[ incxi ] = s * bsize; assert(deltaCLBLAS[ incxi ] >= 0); } for (i = 0; i < (int)N; i++) { size_t incxi; incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); delta[ incxi ] += deltaCLBLAS[ incxi ]; } delete[] deltaCLBLAS; } template void tbsvDelta( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t K, T *A, size_t lda, T *X, int incx, cl_double *delta) { cl_double *deltaCLBLAS, s; int i, j, jStart, jEnd, idx; int zinc; size_t z = 0; size_t bsize, lenX; bool isUpper = false; size_t previncxi=0; T v; isUpper = ((uplo == clblasUpper) && (transA == clblasNoTrans)) || ((uplo == clblasLower) && (transA != clblasNoTrans)); lenX = 1 + (N-1)*abs(incx); deltaCLBLAS = new cl_double[lenX]; bsize = trsvBlockSize(sizeof(T)); // Calculate delta of TRSV evaluated with the Gauss' method if (isUpper) { for (i = (int)N - 1; i >= 0; i--) { size_t incxi; incxi = (incx > 0) ? 
(i*incx) : (N-1-i)*abs(incx); v = getElement(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX); if (diag == clblasNonUnit) { v = v / getElementBanded(order, uplo, i, i, K, A, lda); } s = module(v) * DELTA_0(); if (i == (int)(N - 1)) { delta[ incxi ] = s; } else { delta[ incxi ] = s + delta[ previncxi ]; } assert(delta[ incxi ] >= 0); previncxi = incxi; } } else { for (i = 0; i < (int)N; i++) { size_t incxi; incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); v = getElement(clblasColumnMajor, clblasNoTrans, incxi, 0, X, lenX); if (diag == clblasNonUnit) { v = v / getElementBanded(order, uplo, i, i, K, A, lda); } s = module(v) * DELTA_0(); if (i == 0) { delta[ incxi ] = s; } else { delta[ incxi ] = s + delta[ previncxi ]; } assert(delta[ incxi ] >= 0); previncxi = incxi; } } // Calculate clblas TRSV delta for (i = 0; i < (int)N; i++) { size_t incxi; s = 0.0; if (isUpper) { jStart = i / (int)bsize; // index of the block just after the last matrix block jEnd = ((int)N + (int)bsize - 1) / (int)bsize; z = 1; zinc = 1; } else { jStart = 0; jEnd = i / (int)bsize + 1; z = jEnd - jStart; zinc = -1; } for (j = jStart; j < jEnd; j++) { size_t incxi; idx = j * (int)bsize + i % (int)bsize; if (idx >= (int)N) { continue; } incxi = (incx > 0) ? (idx*incx) : (N-1-idx)*abs(incx); s += z * delta[ incxi ]; z += zinc; } incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); deltaCLBLAS[ incxi ] = s * bsize; assert(deltaCLBLAS[ incxi ] >= 0); } for (i = 0; i < (int)N; i++) { size_t incxi; incxi = (incx > 0) ? (i*incx) : (N-1-i)*abs(incx); delta[ incxi ] += deltaCLBLAS[ incxi ]; } delete[] deltaCLBLAS; } #endif clblas-2.10/src/tests/functional/000077500000000000000000000000001264277366700170135ustar00rootroot00000000000000clblas-2.10/src/tests/functional/BlasBase-func.cpp000066400000000000000000000060531264277366700221300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include namespace clMath { static size_t imageMaxDimension(cl_context context, int widthHeight) { cl_int err; cl_device_id devices[2]; size_t i, retSize; size_t rc = (size_t)-1; cl_device_info par; par = (widthHeight) ? 
CL_DEVICE_IMAGE2D_MAX_HEIGHT : CL_DEVICE_IMAGE2D_MAX_WIDTH; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &retSize); if (err == CL_SUCCESS) { size_t s; retSize /= sizeof(cl_device_id); for (i = 0; (i < retSize) && (err == CL_SUCCESS); i++) { err = clGetDeviceInfo(devices[i], par, sizeof(s), &s, NULL); if (err == CL_SUCCESS) { rc = std::min(rc, s); } } } if (err != CL_SUCCESS) { rc = 0; } return rc; } static size_t imageMaxWidth(cl_context context) { return imageMaxDimension(context, 0); } static size_t imageMaxHeight(cl_context context) { return imageMaxDimension(context, 1); } clblasStatus BlasBase::addScratchImages(void) { //cl_ulong memSize, allocSize; //size_t width, height; //clblasStatus status; //float scale; ///* // * get maximum amount of memory each image can takes, not // * forgetting that it can be up to three matrices residing // * in memory objects // */ //allocSize = maxMemAllocSize(); //memSize = availGlobalMemSize(0); //if (allocSize > memSize / 5) { // allocSize = memSize / 5; // scale = 1.4f; //} //else { // scale = 1.5f; //} //height = static_cast(sqrt(static_cast(allocSize) / sizeof(cl_float))); //width = height / 4; //height = static_cast(height / scale); //width = static_cast(width * scale); //if (height > imageMaxHeight(context_)) { // height = imageMaxHeight(context_); //} //if (width > imageMaxWidth(context_)) { // width = imageMaxWidth(context_); //} //imageA_ = clblasAddScratchImage(context_, width, height, &status); //if (imageA_) { // imageB_ = clblasAddScratchImage(context_, width, height, &status); //} //return status; return clblasNotImplemented; } } // namespace clblas-2.10/src/tests/functional/func-error.cpp000066400000000000000000001017501264277366700216050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include "blas-wrapper.h" #include "clBLAS-wrapper.h" #include "BlasBase.h" #include "blas-random.h" #include "timer.h" #include "func.h" template class ErrorClass { M metod; protected: bool generateData(); public: void error(cl_int err_etalon); // nano_time_t runRepeat(int rep, cl_int* err); }; template bool ErrorClass::generateData() { metod.generateData(); bool ret = metod.prepareDataToRun(); if (!ret) { ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); } return ret; } template void ErrorClass::error(cl_int err_etalon) { metod.initDefault(1024, 1); cl_command_queue queues = metod.queues[0]; if (generateData()) { switch (err_etalon) { case CL_INVALID_EVENT_WAIT_LIST: metod.inEvent = NULL; metod.inEventCount = 1; break; case CL_INVALID_EVENT: metod.outEvent = NULL; metod.inEventCount = 1; break; case CL_INVALID_CONTEXT: clReleaseContext(metod.context); break; case CL_INVALID_COMMAND_QUEUE: metod.queues[0] = NULL; break; case clblasInvalidMatA: case clblasInvalidVecX: case CL_INVALID_MEM_OBJECT: metod.bufA = NULL; metod.bufAP = NULL; metod.bufX = NULL; metod.bufY = NULL; break; case CL_INVALID_DEVICE: break; case clblasInsufficientMemMatA: case clblasInsufficientMemMatB: case clblasInsufficientMemVecX: case CL_INVALID_VALUE: metod.size = 2048; //metod.bufA = NULL; break; default: FAIL() << "Unknown Error cod " << err_etalon; } cl_int err = metod.run(); metod.queues[0] = queues; ASSERT_EQ(err, err_etalon) << "clFinish()"; } metod.destroy(); } #ifdef DO_THEIRS // Instantiate the test TEST(ERROR, InvalidCommandQueue) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitList) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObject) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValue) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevice) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } // Instantiate the test #endif #ifdef DO_TRMV TEST(ERROR, InvalidCommandQueuetrmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListtrmv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecttrmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuetrmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicetrmv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_TRSV TEST(ERROR, InvalidCommandQueue_trsv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitList_trsv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObject_trsv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValue_trsv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevice_trsv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_TPSV TEST(ERROR, InvalidCommandQueue_tpsv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitList_tpsv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObject_tpsv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValue_tpsv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevice_tpsv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_TPMV TEST(ERROR, InvalidCommandQueue_tpmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitList_tpmv) { ErrorClass > ec; 
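    // The harness (ErrorClass::error above) leaves inEvent NULL while setting
    // inEventCount to 1, so the wrapped TPMV call is expected to return
    // CL_INVALID_EVENT_WAIT_LIST.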
ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObject_tpmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValue_tpmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevice_tpmv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SYMM TEST(ERROR, InvalidCommandQueuesymm) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListsymm) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectsymm) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuesymm) { ErrorClass > ec; ec.error(clblasInsufficientMemMatB); } TEST(ERROR, InvalidDevicesymm) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SYR TEST(ERROR, InvalidCommandQueuesyr) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListsyr) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectsyr) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuesyr) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicesyr) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SPR TEST(ERROR, InvalidCommandQueuespr) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListspr) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectspr) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuespr) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicespr) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SYR2 TEST(ERROR, InvalidCommandQueuesyr2) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListsyr2) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectsyr2) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuesyr2) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicesyr2) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_GER TEST(ERROR, InvalidCommandQueueger) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListger) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectger) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueger) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceger) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_GERC TEST(ERROR, InvalidCommandQueuegerc) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListgerc) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectgerc) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, 
InvalidValuegerc) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicegerc) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HER TEST(ERROR, InvalidCommandQueueher) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListher) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecther) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueher) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceher) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HER2 TEST(ERROR, InvalidCommandQueueher2) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListher2) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecther2) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueher2) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceher2) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HEMM TEST(ERROR, InvalidCommandQueuehemm) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhemm) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthemm) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehemm) { ErrorClass > ec; ec.error(clblasInsufficientMemMatB); } TEST(ERROR, InvalidDevicehemm) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HEMV TEST(ERROR, InvalidCommandQueuehemv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhemv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthemv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehemv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehemv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HERK TEST(ERROR, InvalidCommandQueueherk) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListherk) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectherk) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueherk) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceherk) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HPMV TEST(ERROR, InvalidCommandQueuehpmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhpmv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthpmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehpmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehpmv) { clMath::BlasBase* base = 
clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SPMV TEST(ERROR, InvalidCommandQueuespmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListspmv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectspmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuespmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicespmv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SPR2 TEST(ERROR, InvalidCommandQueuespr2) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListspr2) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectspr2) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuespr2) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicespr2) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HPR TEST(ERROR, InvalidCommandQueuehpr) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhpr) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthpr) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehpr) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehpr) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HPR2 TEST(ERROR, InvalidCommandQueuehpr2) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhpr2) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthpr2) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehpr2) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehpr2) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_GBMV TEST(ERROR, InvalidCommandQueueGBMV) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListGBMV) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectGBMV) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueGBMV) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceGBMV) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SBMV TEST(ERROR, InvalidCommandQueuesbmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListsbmv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectsbmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuesbmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicesbmv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef 
DO_HBMV TEST(ERROR, InvalidCommandQueuehbmv) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListhbmv) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecthbmv) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValuehbmv) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDevicehbmv) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_TBMV TEST(ERROR, InvalidCommandQueueTBMV) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListTBMV) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectTBMV) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueTBMV) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceTBMV) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_TBSV TEST(ERROR, InvalidCommandQueueTBSV) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListTBSV) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectTBSV) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueTBSV) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDeviceTBSV) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_HER2K TEST(ERROR, InvalidCommandQueueher2k) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListher2k) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjecther2k) { ErrorClass > ec; ec.error(clblasInvalidMatA); } TEST(ERROR, InvalidValueher2k) { ErrorClass > ec; ec.error(clblasInsufficientMemMatA); } TEST(ERROR, InvalidDeviceher2k) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SCAL TEST(ERROR, InvalidCommandQueuescal) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListscal) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectscal) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuescal) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicescal) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SSCAL TEST(ERROR, InvalidCommandQueuesscal) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListsscal) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectsscal) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuesscal) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicesscal) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_SWAP TEST(ERROR, InvalidCommandQueueswap) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, 
InvalidEventWaitListswap) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectswap) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValueswap) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDeviceswap) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_COPY TEST(ERROR, InvalidCommandQueuecopy) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListcopy) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectcopy) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuecopy) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicecopy) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_AXPY TEST(ERROR, InvalidCommandQueueaxpy) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListaxpy) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectaxpy) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValueaxpy) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDeviceaxpy) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif //DOT #ifdef DO_DOT TEST(ERROR, InvalidCommandQueuedot) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListdot) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectdot) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuedot) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicedot) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_ASUM TEST(ERROR, InvalidCommandQueueasum) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListasum) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectasum) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValueasum) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDeviceasum) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_iAMAX TEST(ERROR, InvalidCommandQueueiamax) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListiamax) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectiamax) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValueiamax) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDeviceiamax) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif //DOTC #ifdef DO_DOTC TEST(ERROR, InvalidCommandQueuedotc) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListdotc) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectdotc) { 
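    // For the InvalidMemObject / clblasInvalid* checks the harness NULLs out
    // bufA, bufAP, bufX and bufY before running (see ErrorClass::error), so
    // this DOTC call is expected to fail with clblasInvalidVecX.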
ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuedotc) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicedotc) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_ROTG TEST(ERROR, InvalidCommandQueuerotg) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListrotg) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectrotg) { ErrorClass > ec; ec.error(clblasInvalidVecX); } /* Skipping Invalid value- because rotg doesn't depend on parameter N, So even passing an invalid N doesn't matter TEST(ERROR, InvalidValuerotg) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } */ TEST(ERROR, InvalidDevicerotg) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_ROTM TEST(ERROR, InvalidCommandQueuerotm) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListrotm) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectrotm) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuerotm) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicerotm) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_ROT TEST(ERROR, InvalidCommandQueuerot) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListrot) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectrot) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuerot) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicerot) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_ROTMG TEST(ERROR, InvalidCommandQueuerotmg) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListrotmg) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectrotmg) { ErrorClass > ec; ec.error(clblasInvalidVecX); } /* Skipping Invalid value- because rotg doesn't depend on parameter N, So even passing an invalid N doesn't matter TEST(ERROR, InvalidValuerotmg) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } */ TEST(ERROR, InvalidDevicerotmg) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif #ifdef DO_NRM2 TEST(ERROR, InvalidCommandQueuenrm2) { ErrorClass > ec; ec.error(CL_INVALID_COMMAND_QUEUE); } TEST(ERROR, InvalidEventWaitListnrm2) { ErrorClass > ec; ec.error(CL_INVALID_EVENT_WAIT_LIST); } TEST(ERROR, InvalidMemObjectnrm2) { ErrorClass > ec; ec.error(clblasInvalidVecX); } TEST(ERROR, InvalidValuenrm2) { ErrorClass > ec; ec.error(clblasInsufficientMemVecX); } TEST(ERROR, InvalidDevicenrm2) { clMath::BlasBase* base = clMath::BlasBase::getInstance(); if (!base->isDevSupportDoublePrecision()) { ErrorClass > ec; ec.error(CL_INVALID_DEVICE); } } #endif clblas-2.10/src/tests/functional/func-event.cpp000066400000000000000000000740571264277366700216060ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#include // srand() //#include // memcpy() #include #include // //#include "common.h" //#include "blas.h" #include "blas-wrapper.h" #include "clBLAS-wrapper.h" #include "BlasBase.h" #include "blas-random.h" #include "timer.h" #include "func.h" template class EventClass { M metod; protected: void eventOutCorrectnessTest(); void eventInCorrectnessTest(); bool generateData(); public: void runOut(); void runIn(); }; template bool EventClass::generateData() { metod.generateData(); bool ret =metod.prepareDataToRun(); if (!ret) { ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); } return ret; } template void EventClass::runOut() { metod.initDefault(512*4, 1); eventOutCorrectnessTest(); metod.destroy(); } template void EventClass::runIn() { metod.initDefault(256, 1); eventInCorrectnessTest(); metod.destroy(); } template void EventClass::eventOutCorrectnessTest() { cl_int err; if (generateData()) { metod.initOutEvent(); err = metod.run(); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; //logEvent(events); err = clFinish(metod.queues[0]); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; cl_int ret = CL_SUCCESS; err = clGetEventInfo(*metod.outEvent, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL); ASSERT_EQ(err, CL_SUCCESS) << "clGetEventInfo()"; ASSERT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()"; } } template void EventClass::eventInCorrectnessTest() { cl_int err; cl_int ret = CL_SUCCESS; int qmax = metod.qnum; nano_time_t minSleepTime = 100000000; if (generateData()) { metod.outEvent = new cl_event[1]; metod.outEvent[0] = NULL; nano_time_t timeFirst = getCurrentTime(); // First run. err = metod.run(); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; for (int i = 0; i < qmax; ++i) { err = clFinish(metod.queues[i]); } ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; timeFirst = getCurrentTime() - timeFirst; cl_event event = clCreateUserEvent(metod.context, &err); ASSERT_EQ(err, CL_SUCCESS) << "clCreateUserEvent()"; metod.inEventCount = 1; metod.inEvent = &event; err = metod.run(); ASSERT_EQ(err, CL_SUCCESS) << "runClBlasFunction()"; clFlush(metod.queues[0]); // sleepTime((timeFirst < minSleepTime)? 
minSleepTime : timeFirst); clSetUserEventStatus(event, CL_COMPLETE); err = clFinish(metod.queues[0]); err = clGetEventInfo(metod.outEvent[0], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL); ASSERT_EQ(err, CL_SUCCESS) << "clGetEventInfo()"; ASSERT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()"; clReleaseEvent(event); metod.inEventCount = 0; metod.inEvent = NULL; } } #ifdef DO_THEIRS //******************************************************// TEST(EVENT_OUT, sgemm) { EventClass< GemmMetod > ec; ec.runOut(); } TEST(EVENT_OUT, cgemm) { EventClass< GemmMetod > ec; ec.runOut(); } TEST(EVENT_OUT, dgemm) { CHECK_DOUBLE; EventClass< GemmMetod > ec; ec.runOut(); } TEST(EVENT_OUT, zgemm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } //******************************************************// TEST(EVENT_OUT, strmm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctrmm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtrmm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztrmm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } //******************************************************// TEST(EVENT_OUT, strsm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctrsm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtrsm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztrsm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } //******************************************************// TEST(EVENT_OUT, sgemv) { EventClass > ec; ec.runOut(); } #if defined(_USE_GEMV_COMPLEX) TEST(EVENT_OUT, cgemv) { EventClass > ec; ec.runOut(); } #endif TEST(EVENT_OUT, dgemv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #if defined(_USE_GEMV_COMPLEX) TEST(EVENT_OUT, zgemv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //******************************************************// TEST(EVENT_OUT, ssymv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dsymv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } //******************************************************// TEST(EVENT_OUT, ssyr2k) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dsyr2k) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } //******************************************************// //******************************************************// TEST(EVENT_IN, sgemm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, cgemm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dgemm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, zgemm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } //******************************************************// TEST(EVENT_IN, strmm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctrmm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dgtrmm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztrmm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } //******************************************************// TEST(EVENT_IN, strsm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctrsm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dtrsm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztrsm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } //******************************************************// TEST(EVENT_IN, sgemv) { EventClass > ec; ec.runIn(); } #if defined(_USE_GEMV_COMPLEX) TEST(EVENT_IN, cgemv) { EventClass > ec; ec.runIn(); } #endif TEST(EVENT_IN, dgemv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #if defined(_USE_GEMV_COMPLEX) TEST(EVENT_IN, zgemv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif 
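/*
 * The EVENT_IN tests above exercise eventInCorrectnessTest(): every clBLAS
 * call is made to wait on a host-controlled user event, so the enqueued work
 * cannot start until the host signals it.  A minimal stand-alone sketch of
 * that OpenCL pattern (illustrative only; sgemm chosen arbitrarily, and the
 * variable names are not part of the harness):
 *
 *   cl_int err;
 *   cl_event gate = clCreateUserEvent(ctx, &err);     // host-controlled gate
 *   cl_event done = NULL;
 *   err = clblasSgemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans,
 *                     N, N, N, alpha, bufA, 0, N, bufB, 0, N, beta,
 *                     bufC, 0, N, 1, &queue,
 *                     1, &gate, &done);               // one event in the wait list
 *   clFlush(queue);                                   // submit; work stays blocked
 *   clSetUserEventStatus(gate, CL_COMPLETE);          // release the gate
 *   clFinish(queue);
 *   // 'done' should now report CL_COMPLETE through clGetEventInfo().
 *   clReleaseEvent(gate);
 */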
//******************************************************// TEST(EVENT_IN, ssymv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dsymv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } //******************************************************// TEST(EVENT_IN, ssyr2k) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dsyr2k) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif #ifdef DO_TRMV // TRMV //******************************************************// TEST(EVENT_OUT, strmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtrmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctrmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztrmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, strmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dtrmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctrmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztrmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif #ifdef DO_TPMV TEST(EVENT_OUT, stpmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtpmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctpmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztpmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, stpmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dtpmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctpmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztpmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif #ifdef DO_TRSV //******************************************************// // TRSV TEST(EVENT_OUT, strsv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtrsv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctrsv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztrsv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, strsv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dtrsv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctrsv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztrsv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif #ifdef DO_TPSV TEST(EVENT_OUT, stpsv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, dtpsv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ctpsv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ztpsv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, stpsv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, dtpsv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ctpsv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ztpsv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } #endif #ifdef DO_SYMM TEST(EVENT_IN, Ssymm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dsymm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Csymm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zsymm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Ssymm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dsymm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Csymm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zsymm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SYR TEST(EVENT_IN, Ssyr) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dsyr) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Ssyr) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dsyr) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SPR TEST(EVENT_IN, Sspr) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dspr) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } 
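/*
 * The matching EVENT_OUT tests only check that the event returned by the
 * clBLAS call has completed once the queue is finished (see
 * eventOutCorrectnessTest() above).  Stripped of the harness, the check is
 * essentially (sketch only, error handling omitted):
 *
 *   cl_event done = NULL;
 *   // ... enqueue the clBLAS routine, passing &done as its out-event ...
 *   clFinish(queue);
 *   cl_int status = CL_QUEUED;
 *   clGetEventInfo(done, CL_EVENT_COMMAND_EXECUTION_STATUS,
 *                  sizeof(cl_int), &status, NULL);
 *   // status is expected to equal CL_COMPLETE at this point
 */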
TEST(EVENT_OUT, Sspr) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dspr) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SYR2 TEST(EVENT_IN, Ssyr2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dsyr2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Ssyr2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dsyr2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_GER TEST(EVENT_IN, Sger) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dger) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Cgeru) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zgeru) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sger) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dger) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Cgeru) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zgeru) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HER TEST(EVENT_IN, Cher) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zher) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cher) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zher) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_GERC TEST(EVENT_IN, Cgerc) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zgerc) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cgerc) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zgerc) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HER2 TEST(EVENT_IN, Cher2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zher2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cher2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zher2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HEMM TEST(EVENT_IN, Chemm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhemm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chemm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhemm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HEMV TEST(EVENT_IN, Chemv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhemv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chemv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhemv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HERK TEST(EVENT_IN, Cherk) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zherk) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cherk) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zherk) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HPMV TEST(EVENT_IN, Chpmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhpmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chpmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhpmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SPMV TEST(EVENT_IN, Sspmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dspmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sspmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dspmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SPR2 TEST(EVENT_IN, Sspr2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dspr2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sspr2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dspr2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HPR TEST(EVENT_IN, Chpr) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhpr) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chpr) { 
EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhpr) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HPR2 TEST(EVENT_IN, Chpr2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhpr2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chpr2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhpr2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_GBMV TEST(EVENT_IN, CGBMV) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ZGBMV) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, CGBMV) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ZGBMV) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SBMV TEST(EVENT_IN, Ssbmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dsbmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Ssbmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dsbmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //DOT #ifdef DO_DOT TEST(EVENT_IN, Sdot) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Ddot) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sdot) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Ddot) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Cdotu) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zdotu) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cdotu) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zdotu) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //ASUM #ifdef DO_ASUM TEST(EVENT_IN, Sasum) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dasum) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sasum) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dasum) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Scasum) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dzasum) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Scasum) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dzasum) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //iAMAX #ifdef DO_iAMAX TEST(EVENT_IN, iSamax) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, iDamax) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, iSamax) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, iDamax) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, iCamax) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, iZamax) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, iCamax) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, iZamax) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //DOTC #ifdef DO_DOTC TEST(EVENT_IN, Cdotc) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zdotc) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cdotc) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zdotc) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HBMV TEST(EVENT_IN, Chbmv) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zhbmv) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Chbmv) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zhbmv) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_TBMV TEST(EVENT_IN, CTBMV) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ZTBMV) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, CTBMV) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, ZTBMV) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_TBSV TEST(EVENT_IN, CTBSV) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, ZTBSV) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, CTBSV) { EventClass > ec; 
ec.runOut(); } TEST(EVENT_OUT, ZTBSV) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_HER2K TEST(EVENT_IN, Cher2k) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zher2k) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cher2k) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zher2k) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SCAL TEST(EVENT_IN, Sscal) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dscal) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sscal) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dscal) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Cscal) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zscal) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cscal) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zscal) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SSCAL TEST(EVENT_IN, Csscal) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zdscal) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Csscal) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zdscal) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_SWAP TEST(EVENT_IN, Sswap) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dswap) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Sswap) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dswap) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Cswap) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zswap) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Cswap) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zswap) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif //copy #ifdef DO_COPY TEST(EVENT_IN, Scopy) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dcopy) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Scopy) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dcopy) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Ccopy) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zcopy) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Ccopy) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zcopy) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_AXPY TEST(EVENT_IN, Saxpy) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Daxpy) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Saxpy) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Daxpy) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Caxpy) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zaxpy) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Caxpy) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zaxpy) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_ROTG TEST(EVENT_IN, Srotg) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Drotg) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Srotg) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Drotg) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Crotg) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Zrotg) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Crotg) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zrotg) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_ROTM TEST(EVENT_IN, Srotm) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Drotm) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Srotm) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Drotm) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } 
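// Note: the modified-Givens routines rotm/rotmg are defined only for real
// types in BLAS, so only the S/D variants are tested here, whereas rot below
// also has the mixed csrot/zdrot forms.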
#endif #ifdef DO_ROT TEST(EVENT_IN, Srot) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Drot) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Csrot) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Zdrot) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_ROTMG TEST(EVENT_IN, Srotmg) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Drotmg) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Srotmg) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Drotmg) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif #ifdef DO_NRM2 TEST(EVENT_IN, Snrm2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dnrm2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Snrm2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dnrm2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } TEST(EVENT_IN, Scnrm2) { EventClass > ec; ec.runIn(); } TEST(EVENT_IN, Dznrm2) { CHECK_DOUBLE; EventClass > ec; ec.runIn(); } TEST(EVENT_OUT, Scnrm2) { EventClass > ec; ec.runOut(); } TEST(EVENT_OUT, Dznrm2) { CHECK_DOUBLE; EventClass > ec; ec.runOut(); } #endif clblas-2.10/src/tests/functional/func-images.cpp000066400000000000000000000144751264277366700217300ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ //#include // srand() //#include // memcpy() #include #include // //#include "common.h" //#include "blas.h" #include "blas-wrapper.h" #include "clBLAS-wrapper.h" #include "BlasBase.h" #include "blas-random.h" #include "timer.h" #include "func.h" #include template class ImagesClass { enum { I_DEFAULT = -1, I_BUFERS, I_IMAGES, I_CASHES }; M metod; protected: bool generateData(); void setImplementation(int i); public: void images(); nano_time_t runRepeat(int rep, cl_int* err); }; template void ImagesClass::setImplementation(int i) { char str[100]; clMath::BlasBase *base = clMath::BlasBase::getInstance(); if (i != I_IMAGES) { if (base->useImages()) { base->removeScratchImages(); } base->setUseImages(false); } #if WIN32 if (i == I_DEFAULT) { sprintf (str, "%s=", metod.env); } else { sprintf (str, "%s=%i",metod.env, i); } _putenv(str); #else if (i == I_DEFAULT) { str[0] = '\0'; } else { sprintf (str, "%i", i); } setenv(metod.env, str, 1); #endif if (i == I_IMAGES) { base->setUseImages(true); if (base->useImages()) { if (base->addScratchImages()) { std::cerr << ">> FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" << std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); } } } } template bool ImagesClass::generateData() { metod.generateData(); bool ret = metod.prepareDataToRun(); if (!ret) { ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." 
<< ::std::endl; SUCCEED(); } return ret; } template nano_time_t ImagesClass::runRepeat(int rep, cl_int* err) { nano_time_t time1 = getCurrentTime(); for (int i= 0; i < rep; i++) { nano_time_t time = getCurrentTime(); *err = metod.run(); if (*err != CL_SUCCESS) { return 0; } *err = clFinish(metod.queues[0]); if (*err != CL_SUCCESS) { return 0; } time = getCurrentTime() - time; time1 = (time < time1)?time:time1 ; } return time1; } template void ImagesClass::images() { cl_int err; int i= 6; int iMax = 30; nano_time_t maxTime = 1000; nano_time_t minTime = 100; bool next = true; do { nano_time_t time; metod.initDefault(256*i, 1); bool b = generateData(); ASSERT_EQ(b, true) << "generateData()"; setImplementation(I_BUFERS); metod.initOutEvent(); time = runRepeat(2, &err); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; //std::cerr << "size = " << 256*i << "/" << i << " time = " << conv2millisec(time) << std::endl; if (conv2millisec(time) < minTime) { i += (((int)minTime - (int)conv2millisec(time)) /20) + 1; metod.destroy(); continue; } if (conv2millisec(time) > maxTime) { i = iMax; metod.destroy(); continue; } next = false; nano_time_t time1 = runRepeat(5, &err); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; setImplementation(I_IMAGES); nano_time_t time2 = runRepeat(5, &err); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; setImplementation(I_DEFAULT); //nano_time_t time3 = runRepeat(5, & err); //ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; double d = (double)(time1) / time2; std::cerr << "size = " << 256*i << " timeBufer = " << conv2millisec(time1) << " timeImage = " << conv2millisec(time2) << " t1/t2 = " << d << std::endl; if (d < 1.2) { next = true; i++; } metod.destroy(); } while (i < iMax && next); ASSERT_TRUE(!next) ; } // Instantiate the test //******************************************************/ TEST(IMAGES, sgemm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, cgemm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, dgemm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); } TEST(IMAGES, zgemm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); }//******************************************************/ TEST(IMAGES, strmm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, ctrmm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, dtrmm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); } TEST(IMAGES, ztrmm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); } //******************************************************/ TEST(IMAGES, strsm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, ctrsm) { ImagesClass > ec; ec.images(); } TEST(IMAGES, dtrsm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); } TEST(IMAGES, ztrsm) { CHECK_DOUBLE; ImagesClass > ec; ec.images(); } //******************************************************/ clblas-2.10/src/tests/functional/func-queue.cpp000066400000000000000000000415461264277366700216060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ //#include // srand() #include // memcpy() #include #include // //#include "common.h" //#include "blas.h" #include "blas-wrapper.h" #include "clBLAS-wrapper.h" #include "BlasBase.h" #include "blas-random.h" #include "timer.h" #include "func.h" template class MQueueClass { M metod; protected: void init(); void run(); void destroy(); public: void testQueue(); }; template void MQueueClass::init() { size_t maxElem = 1024*2; metod.initDefault(maxElem, 0); metod.generateData(); metod.outEvent = NULL; } template void MQueueClass::run() { cl_int err; bool b = metod.prepareDataToRun(); ASSERT_EQ(b, true); int qmax = metod.qnum; metod.initOutEvent(); cl_int ret = CL_SUCCESS; err = metod.run(); ASSERT_EQ(err, CL_SUCCESS); //::std::cerr << "queues = " << base->numCommandQueues() << std::endl; for (int q = 0; q < qmax; ++q) { err = clFinish(metod.queues[q]); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; err = clGetEventInfo(metod.outEvent[q], CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &ret, NULL); //std::cerr << "2: err=" << err <<" ret=" << ret << std::endl; ASSERT_EQ(err, CL_SUCCESS) << "clGetEventInfo()"; ASSERT_EQ(ret, CL_COMPLETE) << "clGetEventInfo()"; } } template void MQueueClass::destroy() { metod.destroy(); } template void MQueueClass::testQueue() { init(); run(); destroy(); } #ifdef DO_THEIRS //******************************************************// TEST(QUEUE, sgemm) { MQueueClass< GemmMetod > ec; ec.testQueue(); } TEST(QUEUE, cgemm) { MQueueClass< GemmMetod > ec; ec.testQueue(); } TEST(QUEUE, dgemm) { CHECK_DOUBLE; MQueueClass< GemmMetod > ec; ec.testQueue(); } TEST(QUEUE, zgemm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } //******************************************************// TEST(QUEUE, strmm) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, ctrmm) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, dtrmm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, ztrmm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } //******************************************************// TEST(QUEUE, strsm) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, ctrsm) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, dtrsm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, ztrsm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } //******************************************************// TEST(QUEUE, sgemv) { MQueueClass > ec; ec.testQueue(); } #if defined(_USE_GEMV_COMPLEX) TEST(QUEUE, cgemv) { MQueueClass > ec; ec.testQueue(); } #endif TEST(QUEUE, dgemv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #if defined(_USE_GEMV_COMPLEX) TEST(QUEUE, zgemv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif //******************************************************// TEST(QUEUE, ssymv) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, dsymv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } //******************************************************// TEST(QUEUE, ssyr2k) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, dsyr2k) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif //DO_THEIRS //****************************************************** #ifdef DO_TRMV TEST(QUEUE, strmv) { MQueueClass< TrmvMetod > ec; ec.testQueue(); } TEST(QUEUE, dtrmv) { CHECK_DOUBLE; MQueueClass< TrmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ctrmv) { MQueueClass< TrmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ztrmv) { CHECK_DOUBLE; MQueueClass< TrmvMetod > ec; ec.testQueue(); } #endif // 
******************************************************/ #ifdef DO_TPMV TEST(QUEUE, stpmv) { MQueueClass< TpmvMetod > ec; ec.testQueue(); } TEST(QUEUE, dtpmv) { CHECK_DOUBLE; MQueueClass< TpmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ctpmv) { MQueueClass< TpmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ztpmv) { CHECK_DOUBLE; MQueueClass< TpmvMetod > ec; ec.testQueue(); } #endif #ifdef DO_TRSV TEST(QUEUE, strsv) { MQueueClass< TrsvMetod > ec; ec.testQueue(); } TEST(QUEUE, dtrsv) { CHECK_DOUBLE; MQueueClass< TrsvMetod > ec; ec.testQueue(); } TEST(QUEUE, ctrsv) { MQueueClass< TrsvMetod > ec; ec.testQueue(); } TEST(QUEUE, ztrsv) { CHECK_DOUBLE; MQueueClass< TrsvMetod > ec; ec.testQueue(); } #endif #ifdef DO_TPSV TEST(QUEUE, stpsv) { MQueueClass< TpsvMetod > ec; ec.testQueue(); } TEST(QUEUE, dtpsv) { CHECK_DOUBLE; MQueueClass< TpsvMetod > ec; ec.testQueue(); } TEST(QUEUE, ctpsv) { MQueueClass< TpsvMetod > ec; ec.testQueue(); } TEST(QUEUE, ztpsv) { CHECK_DOUBLE; MQueueClass< TpsvMetod > ec; ec.testQueue(); } #endif #ifdef DO_SYMM TEST(QUEUE, Ssymm) { MQueueClass< SymmMetod > ec; ec.testQueue(); } TEST(QUEUE, Dsymm) { CHECK_DOUBLE; MQueueClass< SymmMetod > ec; ec.testQueue(); } TEST(QUEUE, Csymm) { MQueueClass< SymmMetod > ec; ec.testQueue(); } TEST(QUEUE, Zsymm) { CHECK_DOUBLE; MQueueClass< SymmMetod > ec; ec.testQueue(); } #endif #ifdef DO_SYR TEST(QUEUE, Ssyr) { MQueueClass< SyrMetod > ec; ec.testQueue(); } TEST(QUEUE, Dsyr) { CHECK_DOUBLE; MQueueClass< SyrMetod > ec; ec.testQueue(); } #endif #ifdef DO_SPR TEST(QUEUE, Sspr) { MQueueClass< SprMetod > ec; ec.testQueue(); } TEST(QUEUE, Dspr) { CHECK_DOUBLE; MQueueClass< SprMetod > ec; ec.testQueue(); } #endif #ifdef DO_SYR2 TEST(QUEUE, Ssyr2) { MQueueClass< Syr2Metod > ec; ec.testQueue(); } TEST(QUEUE, Dsyr2) { CHECK_DOUBLE; MQueueClass< Syr2Metod > ec; ec.testQueue(); } #endif #ifdef DO_GER TEST(QUEUE, sger) { MQueueClass< GerMetod > ec; ec.testQueue(); } TEST(QUEUE, dger) { CHECK_DOUBLE; MQueueClass< GerMetod > ec; ec.testQueue(); } TEST(QUEUE, cger) { MQueueClass< GerMetod > ec; ec.testQueue(); } TEST(QUEUE, zger) { CHECK_DOUBLE; MQueueClass< GerMetod > ec; ec.testQueue(); } #endif #ifdef DO_GERC TEST(QUEUE, cgerc) { MQueueClass< GercMetod > ec; ec.testQueue(); } TEST(QUEUE, zgerc) { CHECK_DOUBLE; MQueueClass< GercMetod > ec; ec.testQueue(); } #endif #ifdef DO_HER TEST(QUEUE, cher) { MQueueClass< HerMetod > ec; ec.testQueue(); } TEST(QUEUE, zher) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HER2 TEST(QUEUE, cher2) { MQueueClass< Her2Metod > ec; ec.testQueue(); } TEST(QUEUE, zher2) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HEMM TEST(QUEUE, chemm) { MQueueClass< HemmMetod > ec; ec.testQueue(); } TEST(QUEUE, zhemm) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HEMV TEST(QUEUE, chemv) { MQueueClass< HemvMetod > ec; ec.testQueue(); } TEST(QUEUE, zhemv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HERK TEST(QUEUE, cherk) { MQueueClass< HerkMetod > ec; ec.testQueue(); } TEST(QUEUE, zherk) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HPMV TEST(QUEUE, chpmv) { MQueueClass< HpmvMetod > ec; ec.testQueue(); } TEST(QUEUE, zhpmv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_SPMV TEST(QUEUE, sspmv) { MQueueClass > ec; ec.testQueue(); } TEST(QUEUE, dspmv) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_SPR2 TEST(QUEUE, Sspr2) { MQueueClass< Spr2Metod > ec; ec.testQueue(); } TEST(QUEUE, Dspr2) { 
CHECK_DOUBLE; MQueueClass< Spr2Metod > ec; ec.testQueue(); } #endif #ifdef DO_HPR TEST(QUEUE, chpr) { MQueueClass< HprMetod > ec; ec.testQueue(); } TEST(QUEUE, zhpr) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_HPR2 TEST(QUEUE, chpr2) { MQueueClass< Hpr2Metod > ec; ec.testQueue(); } TEST(QUEUE, zhpr2) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_GBMV TEST(QUEUE, SGBMV) { MQueueClass< GbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, DGBMV) { CHECK_DOUBLE; MQueueClass< GbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, CGBMV) { MQueueClass< GbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ZGBMV) { CHECK_DOUBLE; MQueueClass< GbmvMetod > ec; ec.testQueue(); } #endif #ifdef DO_SYR TEST(QUEUE, Ssbmv) { MQueueClass< SbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, Dsbmv) { CHECK_DOUBLE; MQueueClass< SbmvMetod > ec; ec.testQueue(); } #endif //DOT #ifdef DO_DOT TEST(QUEUE, Sdot) { MQueueClass< DotMetod > ec; ec.testQueue(); } TEST(QUEUE, Ddot) { CHECK_DOUBLE; MQueueClass< DotMetod > ec; ec.testQueue(); } TEST(QUEUE, Cdotu) { MQueueClass< DotMetod > ec; ec.testQueue(); } TEST(QUEUE, Zdotu) { CHECK_DOUBLE; MQueueClass< DotMetod > ec; ec.testQueue(); } #endif //ASUM #ifdef DO_ASUM TEST(QUEUE, Sasum) { MQueueClass< AsumMetod > ec; ec.testQueue(); } TEST(QUEUE, Dasum) { CHECK_DOUBLE; MQueueClass< AsumMetod > ec; ec.testQueue(); } TEST(QUEUE, Scasum) { MQueueClass< AsumMetod > ec; ec.testQueue(); } TEST(QUEUE, Dzasum) { CHECK_DOUBLE; MQueueClass< AsumMetod > ec; ec.testQueue(); } #endif //iAMAX #ifdef DO_iAMAX TEST(QUEUE, iSamax) { MQueueClass< iAmaxMetod > ec; ec.testQueue(); } TEST(QUEUE, iDamax) { CHECK_DOUBLE; MQueueClass< iAmaxMetod > ec; ec.testQueue(); } TEST(QUEUE, iCamax) { MQueueClass< iAmaxMetod > ec; ec.testQueue(); } TEST(QUEUE, iZamax) { CHECK_DOUBLE; MQueueClass< iAmaxMetod > ec; ec.testQueue(); } #endif //DOTC #ifdef DO_DOTC TEST(QUEUE, Cdotc) { MQueueClass< DotcMetod > ec; ec.testQueue(); } TEST(QUEUE, Zdotc) { CHECK_DOUBLE; MQueueClass< DotcMetod > ec; ec.testQueue(); } #endif #ifdef DO_SYR TEST(QUEUE, Chbmv) { MQueueClass< HbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, Zhbmv) { CHECK_DOUBLE; MQueueClass< HbmvMetod > ec; ec.testQueue(); } #endif #ifdef DO_TBMV TEST(QUEUE, STBMV) { MQueueClass< TbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, DTBMV) { CHECK_DOUBLE; MQueueClass< TbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, CTBMV) { MQueueClass< TbmvMetod > ec; ec.testQueue(); } TEST(QUEUE, ZTBMV) { CHECK_DOUBLE; MQueueClass< TbmvMetod > ec; ec.testQueue(); } #endif #ifdef DO_TBSV TEST(QUEUE, STBSV) { MQueueClass< TbsvMetod > ec; ec.testQueue(); } TEST(QUEUE, DTBSV) { CHECK_DOUBLE; MQueueClass< TbsvMetod > ec; ec.testQueue(); } TEST(QUEUE, CTBSV) { MQueueClass< TbsvMetod > ec; ec.testQueue(); } TEST(QUEUE, ZTBSV) { CHECK_DOUBLE; MQueueClass< TbsvMetod > ec; ec.testQueue(); } #endif #ifdef DO_HER2K TEST(QUEUE, cher2k) { MQueueClass< Her2kMetod > ec; ec.testQueue(); } TEST(QUEUE, zher2k) { CHECK_DOUBLE; MQueueClass > ec; ec.testQueue(); } #endif #ifdef DO_SCAL TEST(QUEUE, Sscal) { MQueueClass< ScalMetod > ec; ec.testQueue(); } TEST(QUEUE, Dscal) { CHECK_DOUBLE; MQueueClass< ScalMetod > ec; ec.testQueue(); } TEST(QUEUE, Cscal) { MQueueClass< ScalMetod > ec; ec.testQueue(); } TEST(QUEUE, Zscal) { CHECK_DOUBLE; MQueueClass< ScalMetod > ec; ec.testQueue(); } #endif #ifdef DO_SSCAL TEST(QUEUE, Csscal) { MQueueClass< SscalMetod > ec; ec.testQueue(); } TEST(QUEUE, Zdscal) { CHECK_DOUBLE; MQueueClass< SscalMetod > ec; ec.testQueue(); } #endif #ifdef DO_SWAP TEST(QUEUE, 
Sswap) { MQueueClass< SwapMetod > ec; ec.testQueue(); } TEST(QUEUE, Dswap) { CHECK_DOUBLE; MQueueClass< SwapMetod > ec; ec.testQueue(); } TEST(QUEUE, Cswap) { MQueueClass< SwapMetod > ec; ec.testQueue(); } TEST(QUEUE, Zswap) { CHECK_DOUBLE; MQueueClass< SwapMetod > ec; ec.testQueue(); } #endif #ifdef DO_COPY TEST(QUEUE, Scopy) { MQueueClass< CopyMetod > ec; ec.testQueue(); } TEST(QUEUE, Dcopy) { CHECK_DOUBLE; MQueueClass< CopyMetod > ec; ec.testQueue(); } TEST(QUEUE, Ccopy) { MQueueClass< CopyMetod > ec; ec.testQueue(); } TEST(QUEUE, Zcopy) { CHECK_DOUBLE; MQueueClass< CopyMetod > ec; ec.testQueue(); } #endif #ifdef DO_AXPY TEST(QUEUE, Saxpy) { MQueueClass< AxpyMetod > ec; ec.testQueue(); } TEST(QUEUE, Daxpy) { CHECK_DOUBLE; MQueueClass< AxpyMetod > ec; ec.testQueue(); } TEST(QUEUE, Caxpy) { MQueueClass< AxpyMetod > ec; ec.testQueue(); } TEST(QUEUE, Zaxpy) { CHECK_DOUBLE; MQueueClass< AxpyMetod > ec; ec.testQueue(); } #endif #ifdef DO_ROTG TEST(QUEUE, Srotg) { MQueueClass< RotgMetod > ec; ec.testQueue(); } TEST(QUEUE, Drotg) { CHECK_DOUBLE; MQueueClass< RotgMetod > ec; ec.testQueue(); } TEST(QUEUE, Crotg) { MQueueClass< RotgMetod > ec; ec.testQueue(); } TEST(QUEUE, Zrotg) { CHECK_DOUBLE; MQueueClass< RotgMetod > ec; ec.testQueue(); } #endif #ifdef DO_ROTM TEST(QUEUE, Srotm) { MQueueClass< RotmMetod > ec; ec.testQueue(); } TEST(QUEUE, Drotm) { CHECK_DOUBLE; MQueueClass< RotmMetod > ec; ec.testQueue(); } #endif #ifdef DO_ROT TEST(QUEUE, Srot) { MQueueClass< RotMetod > ec; ec.testQueue(); } TEST(QUEUE, Drot) { CHECK_DOUBLE; MQueueClass< RotMetod > ec; ec.testQueue(); } TEST(QUEUE, Csrot) { MQueueClass< RotMetod > ec; ec.testQueue(); } TEST(QUEUE, Zdrot) { CHECK_DOUBLE; MQueueClass< RotMetod > ec; ec.testQueue(); } #endif #ifdef DO_ROTMG TEST(QUEUE, Srotmg) { MQueueClass< RotmgMetod > ec; ec.testQueue(); } TEST(QUEUE, Drotmg) { CHECK_DOUBLE; MQueueClass< RotmgMetod > ec; ec.testQueue(); } #endif #ifdef DO_NRM2 TEST(QUEUE, Snrm2) { MQueueClass< Nrm2Metod > ec; ec.testQueue(); } TEST(QUEUE, Dnrm2) { CHECK_DOUBLE; MQueueClass< Nrm2Metod > ec; ec.testQueue(); } TEST(QUEUE, Scnrm2) { MQueueClass< Nrm2Metod > ec; ec.testQueue(); } TEST(QUEUE, Dznrm2) { CHECK_DOUBLE; MQueueClass< Nrm2Metod > ec; ec.testQueue(); } #endif clblas-2.10/src/tests/functional/func-thread.cpp000066400000000000000000000412231264277366700217210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ //#include // srand() #include // memcpy() #include #include // //#include "common.h" //#include "blas.h" #include "blas-wrapper.h" #include "clBLAS-wrapper.h" #include "BlasBase.h" #include "blas-random.h" #include "timer.h" #include #include "func.h" // Parallel thread #define P_TH 5 #if defined(_MSC_VER) #include "windows.h" #include "process.h" #define THREAD_ID HANDLE #define THREAD_START(ID, DATA) \ ID = (HANDLE)_beginthreadex(NULL, 0, &phfunc, &DATA, 0, NULL); #define THREAD_WAIT(ID, RET) \ { \ DWORD r;\ WaitForSingleObject(ID, INFINITE); \ GetExitCodeThread(ID, &r);\ RET = (r == 1);\ } template unsigned __stdcall phfunc(void* vm) { unsigned ret; M* m = (M*) vm; cl_uint err = m->run(); clWaitForEvents(1, m->outEvent); err = m->getResult(); ret = (err == CL_SUCCESS)? 1:0; _endthreadex(ret); return ret; } #else /* defined(_MCS_VER) */ #include "pthread.h" #define THREAD_ID pthread_t #define THREAD_START(ID, DATA) \ pthread_create(&ID, NULL, phfunc, &DATA) #define THREAD_WAIT(ID, RET) \ { \ void* r;\ int res = pthread_join(pt[i], &r); \ (void) res; \ RET =(bool)r;\ } template void* phfunc(void* vm) { M* m = (M*) vm; cl_uint err = m->run(); clWaitForEvents(1, m->outEvent); sleep(1); err = m->getResult(); return (void *)(err == CL_SUCCESS); } #endif template class MThreadClass { M s_metod; M m_metod[P_TH]; protected: void init(); void run(); void destroy(); public: void mthread(); }; template void MThreadClass::init() { //size_t maxElem = 1024; PENDING: Make it back to 1024 size_t maxElem = 128; s_metod.initDefault(maxElem, 1); s_metod.generateData(); for (int i=0; i < P_TH; ++i ) { m_metod[i].initDefault(maxElem, 1); //m_metod[i].generateData(); m_metod[i].copyData(s_metod); } } template void MThreadClass::run() { cl_int err; bool b = s_metod.prepareDataToRun(); ASSERT_EQ(b, true); for (int i=0; i < P_TH; ++i ) { bool b = m_metod[i].prepareDataToRun(); m_metod[i].initOutEvent(); ASSERT_EQ(b, true); } err = s_metod.run(); if (err == CL_SUCCESS) { err = clFinish(s_metod.queues[0]); ASSERT_EQ(err, CL_SUCCESS) << "clFinish()"; err = s_metod.getResult(); ASSERT_EQ(err, CL_SUCCESS); THREAD_ID pt[P_TH]; for (int i=0; i < P_TH; ++i ) { THREAD_START(pt[i], m_metod[i]); } for (int i=0; i < P_TH; ++i ) { bool ret; THREAD_WAIT(pt[i], ret); EXPECT_EQ(ret, true); s_metod.compareData(m_metod[i]); } } else { ::std::cerr << ">> Test skipped." 
<< err <<::std::endl; SUCCEED(); return; } } template void MThreadClass::destroy() { s_metod.destroy(); for (int i=0; i < P_TH; ++i ) { m_metod[i].destroy(); } } template void MThreadClass::mthread() { init(); run(); destroy(); } #ifdef DO_THEIRS TEST(THREAD, sgemm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, cgemm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dgemm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, zgemm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, strmm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctrmm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtrmm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztrmm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } ////////////////////////////////////////////////////////////// TEST(THREAD, strsm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctrsm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtrsm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztrsm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } ////////////////////////////////////////////////////////////// TEST(THREAD, sgemv) { MThreadClass > ec; ec.mthread(); } #if defined(_USE_GEMV_COMPLEX) TEST(THREAD, cgemv) { MThreadClass > ec; ec.mthread(); } #endif TEST(THREAD, dgemv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #if defined(_USE_GEMV_COMPLEX) TEST(THREAD, zgemv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif ////////////////////////////////////////////////////////////// TEST(THREAD, ssymv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dsymv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } //******************************************************// TEST(THREAD, ssyr2k) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dsyr2k) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif //DO_THIERS #ifdef DO_TRMV TEST(THREAD, strmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtrmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctrmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztrmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_TPMV TEST(THREAD, stpmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtpmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctpmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztpmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_TRSV TEST(THREAD, strsv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtrsv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctrsv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztrsv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_TPSV TEST(THREAD, stpsv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, dtpsv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, ctpsv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ztpsv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SYMM TEST(THREAD, Ssymm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dsymm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Csymm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zsymm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SYR TEST(THREAD, Ssyr) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dsyr) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SPR TEST(THREAD, Sspr) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dspr) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } 
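/*
 * Context for the THREAD tests in this file: MThreadClass runs one
 * single-threaded reference pass, then launches P_TH worker threads that each
 * execute an independent copy of the same routine, and every worker's output
 * is compared against the reference.  On POSIX builds the worker entry point
 * is essentially (simplified sketch of phfunc() above, not the literal code):
 *
 *   static void *worker(void *arg) {
 *       Metod *m = static_cast<Metod *>(arg);     // 'Metod' stands in for the test's method type
 *       cl_uint err = m->run();                   // enqueue the clBLAS call
 *       clWaitForEvents(1, m->outEvent);          // wait for its out-event
 *       err = m->getResult();                     // read the result buffer back
 *       return (void *)(err == CL_SUCCESS);       // joined and checked by the test
 *   }
 */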
#endif #ifdef DO_SYR2 TEST(THREAD, Ssyr2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dsyr2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_GER TEST(THREAD, Sger) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dger) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Cger) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zger) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_GERC TEST(THREAD, Cgerc) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zgerc) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HER TEST(THREAD, Cher) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zher) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HER2 TEST(THREAD, Cher2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zher2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HEMM TEST(THREAD, Chemm) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhemm) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HEMV TEST(THREAD, Chemv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhemv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HERK TEST(THREAD, Cherk) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zherk) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HPMV TEST(THREAD, Chpmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhpmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SPMV TEST(THREAD, Sspmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dspmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SPR2 TEST(THREAD, Sspr2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dspr2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HPR TEST(THREAD, Chpr) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhpr) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HPR2 TEST(THREAD, Chpr2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhpr2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_GBMV TEST(THREAD, SGBMV) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ZGBMV) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SBMV TEST(THREAD, Ssbmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dsbmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HBMV TEST(THREAD, Chbmv) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zhbmv) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_TBMV TEST(THREAD, STBMV) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ZTBMV) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_TBSV TEST(THREAD, STBSV) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, ZTBSV) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_HER2K TEST(THREAD, Cher2k) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zher2k) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SCAL TEST(THREAD, Sscal) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zscal) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SSCAL TEST(THREAD, Csscal) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zdscal) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_SWAP TEST(THREAD, Sswap) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zswap) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_AXPY TEST(THREAD, Saxpy) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zaxpy) { CHECK_DOUBLE; MThreadClass > 
ec; ec.mthread(); } #endif #ifdef DO_COPY TEST(THREAD, Scopy) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dcopy) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Ccopy) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zcopy) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif //DOT #ifdef DO_DOT TEST(THREAD, Sdot) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Ddot) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Cdotu) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zdotu) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif //ASUM #ifdef DO_ASUM TEST(THREAD, Sasum) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dasum) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Scasum) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dzasum) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif //iAMAX #ifdef DO_iAMAX TEST(THREAD, iSamax) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, iDamax) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, iCamax) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, iZamax) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif //DOTC #ifdef DO_DOTC TEST(THREAD, Cdotc) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zdotc) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_ROTG TEST(THREAD, Srotg) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zrotg) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_ROTM TEST(THREAD, Srotm) { MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_ROT TEST(THREAD, Srot) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Drot) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Csrot) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Zdrot) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_ROTMG TEST(THREAD, Srotmg) { MThreadClass > ec; ec.mthread(); } #endif #ifdef DO_NRM2 TEST(THREAD, Snrm2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dnrm2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } TEST(THREAD, Scnrm2) { MThreadClass > ec; ec.mthread(); } TEST(THREAD, Dznrm2) { CHECK_DOUBLE; MThreadClass > ec; ec.mthread(); } #endif clblas-2.10/src/tests/functional/func.h000066400000000000000000002255151264277366700201310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #define DO_THEIRS #define DO_TRMV #define DO_TRSV #define DO_SYMM #define DO_SYR #define DO_SPR #define DO_GER #define DO_GERC #define DO_HER #define DO_SYR2 #define DO_HER2 #define DO_HER #define DO_SYR2 #define DO_HER2 #define DO_HEMV #define DO_HEMM #define DO_HERK #define DO_TPMV #define DO_HPMV #define DO_SPMV #define DO_TPSV #define DO_HPR #define DO_SBMV #define DO_HPR2 #define DO_SPR2 #define DO_GBMV #define DO_HBMV #define DO_TBMV #define DO_TBSV #define DO_HER2K #define DO_SCAL #define DO_SSCAL #define DO_DOT #define DO_DOTC #define DO_SWAP #define DO_COPY #define DO_AXPY #define DO_ROTG #define DO_ROTM #define DO_ROTMG #define DO_ROT #define DO_NRM2 #define DO_ASUM #define DO_iAMAX #ifndef FUNC_H_ #define FUNC_H_ //#define _USE_GEMV_COMPLEX #include #include #include #include #include #include #include #include // Functions of BaseMetod Modified // included : As using typeid() // testDG.h contains common definitions and enumerations used for populate() /* bool prepareDataToRun(); void copyData(baseMetod & source); void initDefault(size_t s, unsigned int q, USE_BUFFER ub); void destroy(); void compareData(baseMetod & source); cl_int getResult(); matrix.h // Added support for Packed Matrix getElement(); setElement(); // New stuff added populate() : Can generate data for general, packed, symmetric, lower-upper triangle // // Set diagonal elements to unity, random, zero // Row-Major, Col-Major support // TODO: Hermition Matrix, Banded Matrix support */ enum USE_BUFFER { USE_ABC, USE_AB, USE_AC, USE_AXY, //For TRMV and friends USE_APXY, //For TPMV and friends USE_AX, //For TRSV and friends USE_APX, //For TPSV and friends USE_X, //For blas-1 routines USE_XY, USE_ABXY, USE_ABCXY, // for xROTMG USE_NOTHING // Don't Care: Memory Allocation handled by derived Metod (xxxMetod class) }; typedef enum BUFFER { Aresult, APresult, Bresult, Cresult, Xresult, Yresult } BUFFER_KIND; template class baseMetod { protected: clMath::BlasBase *base; public: typedef T TYPE; T t; USE_BUFFER inputBuffers; BUFFER_KIND resultBuffer; BUFFER_KIND resultBuffer_additional; clblasOrder order; cl_command_queue* queues; cl_uint qnum; cl_context context; cl_event* outEvent; cl_event* inEvent; cl_uint inEventCount; int seed; const char* env; size_t size; //size_t size2; BUFFER_KIND resultLocation; TYPE alpha, beta; cl_mem bufA, bufB, bufC, bufX, bufY, bufAP; TYPE *A, *AP, *B, *C, *X, *Y; size_t ASize, BSize, CSize, XSize, YSize; void initOutEvent(); bool prepareDataToRun(); void copyData(baseMetod & source); void initDefault(size_t s, unsigned int q, USE_BUFFER ub); void destroy(); void compareData(baseMetod & source); cl_int getResult(); }; template bool baseMetod::prepareDataToRun() { if (A != NULL) { bufA = base->createEnqueueBuffer(A, size * size * sizeof (TYPE), 0, ((resultBuffer == Aresult )? CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufA == NULL){ return false; } } if (B != NULL) { bufB = base->createEnqueueBuffer(B, size * size * sizeof (TYPE), 0, ((resultBuffer == Bresult )? CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufB == NULL){ return false; } } if (C != NULL) { bufC = base->createEnqueueBuffer(C, size * size * sizeof (TYPE), 0, ((resultBuffer == Cresult )? CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufC == NULL){ return false; } } if (AP != NULL) { bufAP = base->createEnqueueBuffer(AP, ((size * (size + 1)) / 2) * sizeof (TYPE), 0, ((resultBuffer == APresult )? 
CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufAP == NULL){ return false; } } if (X != NULL) { bufX = base->createEnqueueBuffer(X, size * sizeof (TYPE), 0, ((resultBuffer == Xresult )? CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufX == NULL){ return false; } } if (Y != NULL) { bufY = base->createEnqueueBuffer(Y, size * sizeof (TYPE), 0, ((resultBuffer == Yresult )? CL_MEM_READ_WRITE:CL_MEM_READ_ONLY)); if ( bufY == NULL){ return false; } } return true; } template void baseMetod::initOutEvent() { outEvent = new cl_event[qnum]; for (unsigned int i = 0; i < qnum; ++i) { outEvent[i] = NULL; } } template void baseMetod::copyData(baseMetod & source) { if (source.A != NULL) { //A = new TYPE[size * size]; memcpy(A, source.A, size * size * sizeof(TYPE)); } if (source.B != NULL) { //B = new TYPE[size * size]; memcpy(B, source.B, size * size * sizeof(TYPE)); } if (source.C != NULL) { //C = new TYPE[size * size]; memcpy(C, source.C, size * size * sizeof(TYPE)); } if (source.AP != NULL) { //A = new TYPE[size * size]; memcpy(AP, source.AP, ((size * (size+1)) /2 )* sizeof(TYPE)); } if (source.X != NULL) { //A = new TYPE[size * size]; memcpy(X, source.X, size * sizeof(TYPE)); } if (source.Y != NULL) { //A = new TYPE[size * size]; memcpy(Y, source.Y, size * sizeof(TYPE)); } alpha = source.alpha; beta = source.beta; } template void baseMetod::initDefault(size_t s, unsigned int q, USE_BUFFER ub) { size = s; order = clblasColumnMajor; seed = 12345; base = clMath::BlasBase::getInstance(); if (q > 0) { base->setNumCommandQueues(q); } queues = base->commandQueues(); qnum = base->numCommandQueues(); context = base->context(); alpha = convertMultiplier(base->alpha()); beta = convertMultiplier(base->beta()); outEvent= NULL; inEvent = NULL; inEventCount = 0; switch (ub) { case USE_ABC: A = new TYPE[size * size]; B = new TYPE[size * size]; C = new TYPE[size * size]; AP = NULL; X = NULL; Y = NULL; break; case USE_AB: A = new TYPE[size * size]; B = new TYPE[size * size]; AP = NULL; C = NULL; X = NULL; Y = NULL; break; case USE_AC: A = new TYPE[size * size]; C = new TYPE[size * size]; AP = NULL; B = NULL; X = NULL; Y = NULL; break; case USE_AX: A = new TYPE[size * size]; X = new TYPE[size]; AP = NULL; B = NULL; C = NULL; Y = NULL; break; case USE_AXY: A = new TYPE[size * size]; X = new TYPE[size]; Y = new TYPE[size]; AP = NULL; B = NULL; C = NULL; break; case USE_APXY: AP = new TYPE[(size * (size + 1)) /2]; X = new TYPE[size]; Y = new TYPE[size]; A = NULL; B = NULL; C = NULL; break; case USE_APX: AP = new TYPE[(size * (size + 1)) /2]; X = new TYPE[size]; A = NULL; B = NULL; C = NULL; Y = NULL; break; case USE_ABXY: X = new TYPE[size]; Y = new TYPE[size]; AP = NULL; A = new TYPE[size * size]; B = new TYPE[size * size]; C = NULL; break; // Currently used only for xROTMG requiring 5 buffers // change if it is reusable for more tests case USE_ABCXY: X = new TYPE[size]; Y = new TYPE[size]; AP = NULL; A = new TYPE[size*size];//for D1 B = new TYPE[size*size];// for D2 C = new TYPE[size*size];//for SPARAM break; case USE_X: X = new TYPE[size]; Y = NULL; AP = NULL; A = NULL; B = NULL; C = NULL; break; case USE_XY: // suitable for BLAS-1 routines: copy & swap X = new TYPE[size]; Y = new TYPE[size]; AP = NULL; A = NULL; B = NULL; C = NULL; break; default: AP = NULL; A = NULL; B = NULL; C = NULL; X = NULL; Y = NULL; } bufA = NULL; bufB = NULL; bufC = NULL; bufX = NULL; bufY = NULL; bufAP = NULL; srand(seed); //std::cerr << "init = " << size << std::endl; env = NULL; } template void baseMetod::destroy() { if (outEvent != NULL) { for 
(unsigned int i = 0; i < qnum; ++i) { outEvent[i] = NULL; } delete[](outEvent); } //std::cerr << "destroy "<< std::endl; delete[] this->A; delete[] this->B; delete[] this->C; delete[] this->AP; delete[] this->X; delete[] this->Y; clReleaseMemObject(this->bufA); clReleaseMemObject(this->bufB); clReleaseMemObject(this->bufC); clReleaseMemObject(this->bufAP); clReleaseMemObject(this->bufX); clReleaseMemObject(this->bufY); A = NULL; B = NULL; C = NULL; AP = NULL; X = NULL; Y = NULL; bufA = NULL; bufB = NULL; bufC = NULL; bufAP = NULL; bufX = NULL; bufY = NULL; } template void baseMetod::compareData(baseMetod & source) { /* if (C == NULL) { compareMatrices(order, size, size, B, source.B, size); } else { compareMatrices(order, size, size, C, source.C, size); } */ /* if (C == NULL && ( X == NULL)) { resultBuffer = Bresult; } else { resultBuffer = Cresult; } */ T* s1 = NULL; T* s2 = NULL; s1 = ( resultBuffer == Aresult)? A: ( resultBuffer == Bresult) ? B: ( resultBuffer == Cresult)? C:( resultBuffer == Xresult)? X:( resultBuffer == Yresult)? Y: AP; s2 = ( resultBuffer == Aresult)? source.A: ( resultBuffer == Bresult) ? source.B: ( resultBuffer == Cresult)? source.C:( resultBuffer == Xresult)? source.X:( resultBuffer == Yresult)? source.Y: source.AP; clblasOrder fOrder; size_t m,n,lda; if ( resultBuffer == Aresult || resultBuffer == Bresult || resultBuffer == Cresult ) { m = size; n = size; lda = size; fOrder = order; } else if ( resultBuffer == Xresult || resultBuffer == Yresult ) { m = size; n = 1; lda = size; fOrder = clblasColumnMajor; } else if ( resultBuffer == APresult) { m = size; n = size; lda = 0; // compareMatrix expects lda = 0 for Packed Matrix fOrder = order; } compareMatrices( fOrder, m, n, s1, s2, lda); } template cl_int baseMetod::getResult() { cl_int err; /* if (C == NULL) { err = clEnqueueReadBuffer(queues[0], bufB, CL_TRUE, 0, size * size * sizeof(TYPE), B, 0, NULL, NULL); } else { err = clEnqueueReadBuffer(queues[0], bufC, CL_TRUE, 0, size * size * sizeof(TYPE), C, 0, NULL, NULL); } */ /* if (C == NULL) { resultBuffer = Bresult; } else { resultBuffer = Cresult; } */ T* s = NULL; s = ( resultBuffer == Aresult)? A: ( resultBuffer == Bresult) ? B: ( resultBuffer == Cresult)? C:( resultBuffer == Xresult)? X:( resultBuffer == Yresult)? Y: AP; cl_mem bufs = ( resultBuffer == Aresult)? bufA: ( resultBuffer == Bresult) ? bufB: ( resultBuffer == Cresult)? bufC:( resultBuffer == Xresult)? bufX:( resultBuffer == Yresult)? 
bufY: bufAP; size_t transferSize = 0; if ( resultBuffer == Aresult || resultBuffer == Bresult || resultBuffer == Cresult ) { transferSize = size * size; } else if ( resultBuffer == Xresult || resultBuffer == Yresult ) { transferSize = size; } else if ( resultBuffer == APresult) { transferSize = (size * (size + 1))/2; } transferSize *= sizeof(TYPE); err = CL_SUCCESS; err = clEnqueueReadBuffer(queues[0], bufs, CL_TRUE, 0, transferSize, s, 0, NULL, NULL); return err; } /////// template class GemmMetod : public baseMetod { private: typedef T TYPE; clblasTranspose transA; clblasTranspose transB; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void GemmMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); transA = clblasNoTrans; transB = clblasNoTrans; this->resultBuffer = Cresult; baseMetod::env = "AMD_CLBLAS_GEMM_IMPLEMENTATION"; } template void GemmMetod::generateData() { bool useAlpha = this->base->useAlpha(); bool useBeta = this->base->useBeta(); randomGemmMatrices(this->order, transA, transB, this->size, this->size, this->size, useAlpha, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); } template cl_int GemmMetod::run() { return (cl_int)::clMath::clblas::gemm(this->order, transA, transB, this->size, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->beta, this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class TrmmMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasTranspose transB; clblasUplo uplo; clblasSide side; clblasDiag diag; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void TrmmMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AB); transA = clblasNoTrans; transB = clblasNoTrans; side = clblasLeft; uplo = clblasUpper; diag = clblasUnit; this->resultBuffer = Bresult; baseMetod::env = "AMD_CLBLAS_TRMM_IMPLEMENTATION"; } template void TrmmMetod::generateData() { bool useAlpha = this->base->useAlpha(); randomTrmmMatrices(this->order, side, uplo, diag, this->size, this->size, useAlpha, &this->alpha, this->A, this->size, this->B, this->size); } template cl_int TrmmMetod::run() { return (cl_int)::clMath::clblas::trmm(this->order, this->side, this->uplo, this->transA, clblasUnit, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class TrsmMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasSide side; clblasDiag diag; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void TrsmMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AB); transA = clblasNoTrans; side = clblasLeft; uplo = clblasUpper; diag = clblasUnit; this->resultBuffer = Bresult; baseMetod::env = "AMD_CLBLAS_TRSM_IMPLEMENTATION"; } template void TrsmMetod::generateData() { bool useAlpha = this->base->useAlpha(); randomTrsmMatrices(this->order, side, uplo, diag, this->size, this->size, useAlpha, &this->alpha, this->A, this->size, this->B, 
this->size); } template cl_int TrsmMetod::run() { return (cl_int)::clMath::clblas::trsm(this->order, side, uplo, transA, diag, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class GemvMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasTranspose transB; clblasTranspose transC; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void GemvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); transA = clblasNoTrans; transB = clblasNoTrans; transC = clblasNoTrans; this->resultBuffer = Cresult; } template void GemvMetod::generateData() { bool useAlpha = this->base->useAlpha(); bool useBeta = this->base->useBeta(); randomGemmxMatrices(this->order, transA, transB, transC, this->size, this->size, this->size, useAlpha, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); } template cl_int GemvMetod::run() { return (cl_int)::clMath::clblas::gemv(this->order, transA, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, 1, this->beta, this->bufC, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class SymvMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasTranspose transB; clblasTranspose transC; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void SymvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); transA = clblasNoTrans; transB = clblasNoTrans; transC = clblasNoTrans; uplo = clblasUpper; this->resultBuffer = Cresult; } template void SymvMetod::generateData() { bool useAlpha = this->base->useAlpha(); bool useBeta = this->base->useBeta(); randomGemmxMatrices(this->order, transA, transB, transC, this->size, this->size, this->size, useAlpha, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); } template cl_int SymvMetod::run() { return (cl_int)::clMath::clblas::symv(this->order, uplo, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, 1, this->beta, this->bufC, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class Syr2kMetod : public baseMetod { public: typedef T TYPE; clblasTranspose transA; clblasTranspose transB; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void Syr2kMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); transA = clblasNoTrans; transB = clblasNoTrans; uplo = clblasUpper; this->resultBuffer = Cresult; } template void Syr2kMetod::generateData() { bool useBeta = this->base->useBeta(); randomGemmMatrices(this->order, transA, transB, this->size, this->size, this->size, true, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); } template cl_int Syr2kMetod::run() { return (cl_int)::clMath::clblas::syr2k(this->order, uplo, transA, this->size, this->size, 
this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->beta, this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class TrmvMetod : public baseMetod { public: typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasDiag diagA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void TrmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; transA = clblasNoTrans; uplo = clblasLower; diagA = clblasUnit; this->resultBuffer = Xresult; } template void TrmvMetod::generateData() { /* enum RealMatrixCreationFlags { NO_FLAGS = 0, ROW_MAJOR_ORDER = 1, PACKED_MATRIX = 2, SYMMETRIC_MATRIX = 4, UPPER_HALF_ONLY = 8, LOWER_HALF_ONLY = 16, NO_ALIGNMENT = 32, UNIT_DIAGONAL = 64, RANDOM_INIT = 128, ZERO_DIAGONAL = 256 */ //bool useBeta = this->base->useBeta(); /* randomGemmMatrices(this->order, transA, transB,this->size, this->size, this->size, true, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); */ // Set flags... int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); // Setting uplo creationFlags = ( (this-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; // Matrix A populate( this->A, this->size, this->size, this->size, BlasFn, creationFlags); populate( this->X, this->size, 1, this->size, BlasFn); } template cl_int TrmvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::trmv(type, this->order, uplo, transA, diagA, this->size, this->bufA, 0, this->size, this->bufX, 0, 1, this->bufY/* as Xcopy */, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////// template class TrsvMetod : public baseMetod { public: typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasDiag diagA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void TrsvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AX); this->order = clblasRowMajor; transA = clblasNoTrans; uplo = clblasLower; diagA = clblasUnit; this->resultBuffer = Xresult; } template void TrsvMetod::generateData() { /* enum RealMatrixCreationFlags { NO_FLAGS = 0, ROW_MAJOR_ORDER = 1, PACKED_MATRIX = 2, SYMMETRIC_MATRIX = 4, UPPER_HALF_ONLY = 8, LOWER_HALF_ONLY = 16, NO_ALIGNMENT = 32, UNIT_DIAGONAL = 64, RANDOM_INIT = 128, ZERO_DIAGONAL = 256 */ int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? 
(creationFlags | ROW_MAJOR_ORDER) : (creationFlags); // Setting uplo creationFlags = ( (this-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); // Matrix A //populate( this->A, this->size, this->size, this->size, creationFlags); //populate( this->X, this->size, 1, this->size); randomTrsvMatrices(this->order, this->uplo, this->diagA, this->size, this->A, this->size, this->X, 1); } template cl_int TrsvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::trsv(type, this->order, uplo, transA, diagA, this->size, this->bufA, 0, this->size, this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////// template class TpsvMetod : public baseMetod { public: typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasDiag diagA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void TpsvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AX); this->order = clblasRowMajor; transA = clblasNoTrans; uplo = clblasLower; diagA = clblasUnit; this->resultBuffer = Xresult; } template void TpsvMetod::generateData() { randomTrsvMatrices(this->order, this->uplo, this->diagA, this->size, this->A, 0, this->X, 1); } template cl_int TpsvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::tpsv(type, this->order, uplo, transA, diagA, this->size, this->bufA, 0, this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////// template class SymmMetod : public baseMetod { public: typedef T TYPE; clblasSide side; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void SymmMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); this->order = clblasRowMajor; uplo = clblasLower; side = clblasLeft; this->resultBuffer = Cresult; } template void SymmMetod::generateData() { /* enum RealMatrixCreationFlags { NO_FLAGS = 0, ROW_MAJOR_ORDER = 1, PACKED_MATRIX = 2, SYMMETRIC_MATRIX = 4, UPPER_HALF_ONLY = 8, LOWER_HALF_ONLY = 16, NO_ALIGNMENT = 32, UNIT_DIAGONAL = 64, RANDOM_INIT = 128, ZERO_DIAGONAL = 256 */ //bool useBeta = this->base->useBeta(); /* randomGemmMatrices(this->order, transA, transB,this->size, this->size, this->size, true, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); */ // Set flags... int creationFlags = 0, creationFlagsA; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? 
(creationFlags | ROW_MAJOR_ORDER) : (creationFlags); // Setting uplo //In this case only A matrix is either upper or lower triangular creationFlagsA = creationFlags; creationFlagsA = ( (this-> uplo) == clblasLower)? (creationFlagsA | LOWER_HALF_ONLY) : (creationFlagsA | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYMM; // Matrix A populate(this->A, this->size, this->size, this->size, BlasFn, creationFlagsA ); populate(this->B, this->size, this->size, this->size, BlasFn, creationFlags); populate(this->C, this->size, this->size, this->size, BlasFn, creationFlags); } template cl_int SymmMetod::run() { return (cl_int)::clMath::clblas::symm(this->order, side, uplo, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->beta, this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class SyrMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Lower triangle // template void SyrMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AX); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Aresult; } template void SyrMetod::generateData() { /* // Set flags... int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (this-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_GER; // Matrix A populate( this->A, this->size, this->size, this->size, BlasFn, creationFlags); //Vector X populate( this->X, this->size, 1, this->size, BlasFn); */ randomSyrMatrices( this->order, uplo, this->size, false, &(this->alpha), this->A, this->size, this->X, 1); } template cl_int SyrMetod::run() { return (cl_int)::clMath::clblas::syr(this->order, uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class Syr2Metod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. 
Lower triangle // template void Syr2Metod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Aresult; } template void Syr2Metod::generateData() { randomSyr2Matrices( this->order, uplo, this->size, false, &(this->alpha), this->A, this->size, this->X, 1, this->Y, 1); } template cl_int Syr2Metod::run() { return (cl_int)::clMath::clblas::syr2(this->order, uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class GerMetod : public baseMetod { public: typedef T TYPE; int incx, incy; int m; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void GerMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; this->resultBuffer = Aresult; } template void GerMetod::generateData() { /* enum RealMatrixCreationFlags { NO_FLAGS = 0, ROW_MAJOR_ORDER = 1, PACKED_MATRIX = 2, SYMMETRIC_MATRIX = 4, UPPER_HALF_ONLY = 8, LOWER_HALF_ONLY = 16, NO_ALIGNMENT = 32, UNIT_DIAGONAL = 64, RANDOM_INIT = 128, ZERO_DIAGONAL = 256 */ //bool useBeta = this->base->useBeta(); /* randomGemmMatrices(this->order, transA, transB,this->size, this->size, this->size, true, &this->alpha, this->A, this->size, this->B, this->size, useBeta, &this->beta, this->C, this->size); */ // Set flags... int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID BlasFn = CLBLAS_GER; // Matrix A populate(this->A, this->size, this->size, this->size, BlasFn, creationFlags); populate(this->X, this->size, 1, (1 + (m - 1) * abs(incx)), BlasFn, 0); populate(this->Y, this->size, 1, (1 + (m - 1) * abs(incy)), BlasFn, 0); } template cl_int GerMetod::run() { return (cl_int)::clMath::clblas::ger(this->order, this->size, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class GercMetod : public baseMetod { public: typedef T TYPE; int incx, incy; int m; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void GercMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; this->resultBuffer = Aresult; } template void GercMetod::generateData() { // Set flags... int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? 
(creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID BlasFn = CLBLAS_GER; // Matrix A populate(this->A, this->size, this->size, this->size, BlasFn, creationFlags); populate(this->X, this->size, 1, this->size, BlasFn, 0); populate(this->Y, this->size, 1, this->size, BlasFn, 0); } template cl_int GercMetod::run() { return (cl_int)::clMath::clblas::gerc(this->order, this->size, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class HerMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Lower triangle // template void HerMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AX); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Aresult; } template void HerMetod::generateData() { randomHerMatrices( this->order, uplo, this->size, &(this->alpha), this->A, this->size, this->X, 1 ); } template cl_int HerMetod::run() { return (cl_int)::clMath::clblas::her(this->order, this->uplo, this->size, CREAL(this->alpha), this->bufX, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class Her2Metod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Lower triangle // template void Her2Metod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Aresult; } template void Her2Metod::generateData() { randomHer2Matrices(this->order, uplo, this->size, &(this->alpha), this->A, this->size, this->X, 1, this->Y, 1); } template cl_int Her2Metod::run() { return (cl_int)::clMath::clblas::her2(this->order, this->uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufA, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class HemmMetod : public baseMetod { public: typedef T TYPE; clblasSide side; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void HemmMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); this->order = clblasRowMajor; uplo = clblasLower; side = clblasLeft; this->resultBuffer = Cresult; } template void HemmMetod::generateData() { /* int creationFlags = 0, creationFlagsA; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (this-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlagsA = creationFlags; creationFlagsA = ( (this-> uplo) == clblasLower)? 
(creationFlagsA | LOWER_HALF_ONLY) : (creationFlagsA | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_HEMM; populate(this->A, this->size, this->size, this->size, BlasFn, creationFlagsA ); populate(this->B, this->size, this->size, this->size, BlasFn, creationFlags); populate(this->C, this->size, this->size, this->size, BlasFn, creationFlags); */ randomGemmMatrices(this->order, clblasNoTrans, clblasNoTrans, this->size, this->size, this->size, false, &this->alpha, this->A, this->size, this->B, this->size, false, &this->beta, this->C, this->size); } template cl_int HemmMetod::run() { return (cl_int)::clMath::clblas::hemm(this->order, side, uplo, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, this->beta, this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } // HEMV template class HemvMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void HemvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Yresult; } template void HemvMetod::generateData() { randomHemvMatrices(this->order, uplo, this->size, false, &(this->alpha), this->A, this->size, this->X, 1, false, &(this->beta), this->Y, 1); } template cl_int HemvMetod::run() { return (cl_int)::clMath::clblas::hemv(this->order, uplo, this->size, this->alpha, this->bufA, 0, this->size, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////// template class HerkMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; clblasTranspose transA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. NoTrans case only supported for Packed matrix // template void HerkMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AC); this->order = clblasRowMajor; uplo = clblasLower; transA = clblasNoTrans; this->resultBuffer = Cresult; } template void HerkMetod::generateData() { randomGemmMatrices(this->order, this->transA, clblasNoTrans, this->size, this->size, this->size, false, &this->alpha, this->A, this->size, NULL, 0, false, &this->beta, this->C, this->size); } template cl_int HerkMetod::run() { return (cl_int)::clMath::clblas::herk(this->order, uplo, transA, this->size, this->size, CREAL(this->alpha), this->bufA, 0, this->size, CREAL(this->beta), this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class TpmvMetod : public baseMetod { public: typedef T TYPE; clblasTranspose trans; clblasUplo uplo; clblasDiag diag; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Packed Matrix : lda must be set to zero // 2. Always test with RowMajor Lower in case of Packed matrix // 3. 
NoTrans case only supported for Packed matrix // template void TpmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APXY); this->order = clblasRowMajor; trans = clblasNoTrans; uplo = clblasLower; diag = clblasUnit; this->resultBuffer = Xresult; } template void TpmvMetod::generateData() { // Set flags... int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT | PACKED_MATRIX; // Default is Column-Major creationFlags = ( (this-> order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); // Setting uplo creationFlags = ( (this-> uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; // Matrix A populate( this->AP, this->size, this->size, 0, BlasFn, creationFlags); populate( this->X, this->size, 1, this->size, BlasFn); } template cl_int TpmvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::tpmv(type, this->order, uplo, trans, diag, this->size, this->bufAP, 0, this->bufX, 0, 1, this->bufY/* as Xcopy */, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////// template class SpmvMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void SpmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Yresult; } template void SpmvMetod::generateData() { randomSpmvMatrices(this->order, uplo, this->size, false, &(this->alpha), this->AP, this->X, 1, false, &(this->beta), this->Y, 1); } template cl_int SpmvMetod::run() { return (cl_int)::clMath::clblas::spmv(this->order, uplo, this->size, this->alpha, this->bufAP, 0, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////// template class HpmvMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void HpmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = Yresult; } template void HpmvMetod::generateData() { randomHemvMatrices(this->order, uplo, this->size, false, &(this->alpha), this->AP, 0, this->X, 1, false, &(this->beta), this->Y, 1); } template cl_int HpmvMetod::run() { return (cl_int)::clMath::clblas::hpmv(this->order, uplo, this->size, this->alpha, this->bufAP, 0, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////// template class SprMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. 
Lower triangle // template void SprMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APX); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = APresult; } template void SprMetod::generateData() { randomSyrMatrices( this->order,uplo, this->size, false, &(this->alpha), this->AP, 0, this->X, 1); } template cl_int SprMetod::run() { return (cl_int)::clMath::clblas::spr(this->order, uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufAP, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////// template class HprMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void HprMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APX); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = APresult; } template void HprMetod::generateData() { randomHerMatrices( this->order, uplo, this->size, &(this->alpha), this->AP, 0, this->X, 1 ); } template cl_int HprMetod::run() { return (cl_int)::clMath::clblas::hpr(this->order, this->uplo, this->size, CREAL(this->alpha), this->bufX, 0, 1, this->bufAP, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class Hpr2Metod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Lower triangle // template void Hpr2Metod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = APresult; } template void Hpr2Metod::generateData() { randomHer2Matrices(this->order, uplo, this->size, &(this->alpha), this->AP, 0, this->X, 1, this->Y, 1); } template cl_int Hpr2Metod::run() { return (cl_int)::clMath::clblas::hpr2(this->order, this->uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufAP, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class Spr2Metod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. 
Lower triangle // template void Spr2Metod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_APXY); this->order = clblasRowMajor; uplo = clblasLower; this->resultBuffer = APresult; } template void Spr2Metod::generateData() { randomSyr2Matrices( this->order, uplo, this->size, false, &(this->alpha), this->AP, 0, this->X, 1, this->Y, 1); } template cl_int Spr2Metod::run() { return (cl_int)::clMath::clblas::spr2(this->order, uplo, this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->bufAP, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class GbmvMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Non Transpose // template void GbmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; transA = clblasNoTrans; this->resultBuffer = Yresult; } template void GbmvMetod::generateData() { randomGbmvMatrices(this->order, this->transA, this->size, this->size, &(this->alpha), &(this->beta), this->A, this->size, this->X, 1, this->Y, 1); } template cl_int GbmvMetod::run() { return (cl_int)clMath::clblas::gbmv(this->order, this->transA, this->size, this->size, (1), (1), this->alpha, this->bufA, 0, this->size, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class TbmvMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasDiag diag; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Non Transpose // template void TbmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; transA = clblasNoTrans; uplo = clblasUpper; diag = clblasNonUnit; this->resultBuffer = Yresult; } template void TbmvMetod::generateData() { randomTbmvMatrices(this->size, this->A, this->size, this->X, 1); } template cl_int TbmvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::tbmv(type, this->order, this->uplo, this->transA, this->diag, this->size, (1), this->bufA, 0, this->size, this->bufX, 0, 1, this->bufY, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class SbmvMetod : public baseMetod { typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. 
Non Transpose // template void SbmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; uplo = clblasUpper; this->resultBuffer = Yresult; } template void SbmvMetod::generateData() { randomGbmvMatrices(this->order, clblasNoTrans, this->size, this->size, &(this->alpha), &(this->beta), this->A, this->size, this->X, 1, this->Y, 1); } template cl_int SbmvMetod::run() { return (cl_int)clMath::clblas::sbmv(this->order, this->uplo, this->size, 1, this->alpha, this->bufA, 0, this->size, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //HBMV template class HbmvMetod : public baseMetod { typedef T TYPE; clblasUplo uplo; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; // Assumptions // 1. Testing for Row Major order. // 2. Non Transpose // template void HbmvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->order = clblasRowMajor; uplo = clblasUpper; this->resultBuffer = Yresult; } template void HbmvMetod::generateData() { randomGbmvMatrices(this->order, clblasNoTrans, this->size, this->size, &(this->alpha), &(this->beta), this->A, this->size, this->X, 1, this->Y, 1); } template cl_int HbmvMetod::run() { return (cl_int)clMath::clblas::hbmv(this->order, this->uplo, this->size, 1, this->alpha, this->bufA, 0, this->size, this->bufX, 0, 1, this->beta, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class TbsvMetod : public baseMetod { typedef T TYPE; clblasTranspose transA; clblasUplo uplo; clblasDiag diag; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void TbsvMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AX); this->order = clblasRowMajor; transA = clblasNoTrans; uplo = clblasUpper; diag = clblasNonUnit; this->resultBuffer = Xresult; } template void TbsvMetod::generateData() { randomTbsvMatrices(this->order, this->uplo, this->diag, this->size, 1, this->A, 2, this->X, 1); } template cl_int TbsvMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::tbsv(type, this->order, this->uplo, this->transA, this->diag, this->size, 1, this->bufA, 0, 2, this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////// template class Her2kMetod : public baseMetod { public: typedef T TYPE; clblasUplo uplo; clblasTranspose transA; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void Her2kMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABC); this->order = clblasRowMajor; uplo = clblasLower; transA = clblasNoTrans; this->resultBuffer = Cresult; } template void Her2kMetod::generateData() { clblasTranspose ftransB = (this->transA==clblasNoTrans)? 
clblasConjTrans: clblasNoTrans; randomGemmMatrices(this->order, this->transA, ftransB, this->size, this->size, this->size, false, &this->alpha, this->A, this->size, this->B, this->size, false, &this->beta, this->C, this->size); } template cl_int Her2kMetod::run() { return (cl_int)::clMath::clblas::her2k(this->order, uplo, this->transA, this->size, this->size, this->alpha, this->bufA, 0, this->size, this->bufB, 0, this->size, CREAL(this->beta), this->bufC, 0, this->size, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class ScalMetod : public baseMetod { public: typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void ScalMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_X); this->resultBuffer = Xresult; } template void ScalMetod::generateData() { randomVectors(this->size, this->X, 1); } template cl_int ScalMetod::run() { return (cl_int)::clMath::clblas::scal(false, this->size, this->alpha, this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// // Sscal is for handling the 2 extra cases csscal and zdscal template class SscalMetod : public baseMetod { public: typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void SscalMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_X); this->resultBuffer = Xresult; } template void SscalMetod::generateData() { randomVectors(this->size, this->X, 1); } template cl_int SscalMetod::run() { return (cl_int)::clMath::clblas::scal(true, this->size, this->alpha, this->bufX, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class SwapMetod : public baseMetod { public: typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void SwapMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_XY); this->resultBuffer = Xresult; // no need to have 2 buffers as result, as this is not a correctness test. // proper correctness testing happens in test-correctness. } template void SwapMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int SwapMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT: ( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::swap(type, this->size, this->bufX, 0, 1, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class DotMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void DotMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABXY); this->resultBuffer = Yresult; } template void DotMetod::generateData() { //BlasRoutineID BlasFn = CLBLAS_DOT; // populate( this->X, this->size, 1, this->size, BlasFn); randomVectors(this->size, this->X, 1, this->Y, 1, true); } template cl_int DotMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::dot( type, this->size, this->bufA, 0, this->bufX, 0, 1, this->bufY, 0, 1, this->bufB, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //DOTC template class DotcMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void DotcMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_ABXY); this->resultBuffer = Yresult; } template void DotcMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1, true); } template cl_int DotcMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::dotc( type, this->size, this->bufA, 0, this->bufX, 0, 1, this->bufY, 0, 1, this->bufB, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// //COPY template class CopyMetod : public baseMetod { public: typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void CopyMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_XY); this->resultBuffer = Yresult; } template void CopyMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int CopyMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT: ( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)::clMath::clblas::copy(type, this->size, this->bufX, 0, 1, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////// template class AxpyMetod : public baseMetod { public: typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void AxpyMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_XY); this->resultBuffer = Yresult; } template void AxpyMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int AxpyMetod::run() { return (cl_int)::clMath::clblas::axpy(this->size, this->alpha, this->bufX, 0, 1, this->bufY, 0, 1, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class RotgMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void RotgMetod::initDefault(size_t s, unsigned int q) { //USE_ABXY is actually used to create 2 2-D arrays and 2 vectors //But here we use is to create the required 4 vectors. So more than required memory is allocated here. //As this is functionality test, this does not affect the purpose of the tests. //Here X=SA, Y=SB, A=C and B=S, where RHS's represent the standard netlib variable names baseMetod::initDefault(1, q, USE_ABXY); this->resultBuffer = Yresult; s = s; //Warning } template void RotgMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int RotgMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::rotg( type, this->bufX, 0, this->bufY, 0, this->bufA, 0, this->bufB, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class RotmMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void RotmMetod::initDefault(size_t s, unsigned int q) { // USE_AXY allocates space for 1 2-D array A and 2 vectors: X & Y // Here are we are allocating more memory for PARAM than required, to reuse code. A corrosponds to PARAM. baseMetod::initDefault(s, q, USE_AXY); this->resultBuffer = Yresult; } template void RotmMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); randomVectors(4, this->A + 1, 1); *(this->A) = 0; //Only 4 inputs are valid here, which are tested in correctness and performance test } template cl_int RotmMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? 
TYPE_FLOAT: TYPE_DOUBLE; return (cl_int)clMath::clblas::rotm( type, this->size, this->bufX, 0, 1, this->bufY, 0, 1, this->bufA, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class RotmgMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void RotmgMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(5, q, USE_ABCXY); this->resultBuffer = Cresult; s = s; //Warning } template void RotmgMetod::generateData() { randomRotmg(this->A, this->B, this->X, this->Y, this->C); *(this->C) = 0; //Only 4 inputs are valid here, which are tested in correctness and performance test } template cl_int RotmgMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT: TYPE_DOUBLE; return (cl_int)clMath::clblas::rotmg( type, this->bufA, 0, this->bufB, 0, this->bufX, 0, this->bufY, 0, this->bufC, 0, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ///////////////////////////////////////////////////////////////////////////////// template class RotMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void RotMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_XY); this->resultBuffer = Yresult; } template void RotMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int RotMetod::run() { //DataType type; //type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: // ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::rot( this->size, this->bufX, 0, 1, this->bufY, 0, 1, this->alpha, this->beta, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } //////////////////////////////////////////////////////////////////////////////////////////// template class Nrm2Metod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void Nrm2Metod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->resultBuffer = Yresult; } template void Nrm2Metod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1, true); } template cl_int Nrm2Metod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::nrm2(type, this->size, this->bufY, 0, this->bufX, 0, 1, this->bufA, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } /////////////////////////////////////////////////////// //ASUM template class AsumMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void AsumMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->resultBuffer = Xresult; } template void AsumMetod::generateData() { randomVectors(this->size, this->X, 1, (T*)NULL, 0, true); } template cl_int AsumMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::asum( type, this->size, this->bufA, 0, this->bufX, 0, 1, this->bufY, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////// //iAMAX template class iAmaxMetod : public baseMetod { typedef T TYPE; public: void initDefault(size_t s, unsigned int q); cl_int run(); void generateData(); }; template void iAmaxMetod::initDefault(size_t s, unsigned int q) { baseMetod::initDefault(s, q, USE_AXY); this->resultBuffer = Yresult; } template void iAmaxMetod::generateData() { randomVectors(this->size, this->X, 1, this->Y, 1); } template cl_int iAmaxMetod::run() { DataType type; type = ( typeid(T) == typeid(float))? TYPE_FLOAT:( typeid(T) == typeid(double))? TYPE_DOUBLE: ( typeid(T) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; return (cl_int)clMath::clblas::iamax( type, this->size, this->bufY, 0, this->bufX, 0, 1, this->bufA, this->qnum, this->queues, this->inEventCount, this->inEvent, this->outEvent); } ////////////////////////////////////////////////////////////////////// #define CHECK_DOUBLE \ { \ clMath::BlasBase* base = clMath::BlasBase::getInstance();\ if (!base->isDevSupportDoublePrecision()) {\ ::std::cerr << ">> Double precision is not supported"\ << ::std::endl \ << ">> Test skipped." << ::std::endl;\ SUCCEED();\ return;\ }\ } #endif // FUNC_H_ clblas-2.10/src/tests/functional/test-functional.cpp000066400000000000000000000065371264277366700226510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include "BlasBase.h" /////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { ::clMath::BlasBase *base; TestParams params; int ret; if ((argc > 1) && !strcmp(argv[1], "--test-help")) { printUsage("test-functional"); return 0; } ::testing::InitGoogleTest(&argc, argv); ::std::cerr << "Initialize OpenCL and clblas..." << ::std::endl; base = ::clMath::BlasBase::getInstance(); if (base == NULL) { ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! " "Leaving the test." 
<< ::std::endl; return -1; } if (argc != 1) { params.optFlags = NO_FLAGS; params.devType = CL_DEVICE_TYPE_GPU; params.devName = NULL; if (parseBlasCmdLineArgs(argc, argv, ¶ms) != 0) { printUsage(argv[0]); return 1; } if (params.optFlags & SET_SEED) { base->setSeed(params.seed); } if (params.optFlags & SET_ALPHA) { base->setAlpha(params.alpha); } if (params.optFlags & SET_BETA) { base->setBeta(params.beta); } if (params.optFlags & SET_M) { base->setM(params.M); } if (params.optFlags & SET_N) { base->setN(params.N); } if (params.optFlags & SET_K) { base->setK(params.K); } if (params.optFlags & SET_INCX) { base->setIncX(params.incx); } if (params.optFlags & SET_INCY) { base->setIncY(params.incy); } if (params.optFlags & SET_DEVICE_TYPE) { if (!base->setDeviceType(¶ms.devType, params.devName)) { ::std::cerr << "Fatal error, OpenCL or clblas " "initialization failed! Leaving the test." << ::std::endl; return -1; } } if (params.optFlags & SET_NUM_COMMAND_QUEUES) { base->setNumCommandQueues(params.numCommandQueues); } } parseEnv(¶ms); if ((params.optFlags & SET_USE_IMAGES) && (params.devType != CL_DEVICE_TYPE_CPU)) { base->setUseImages(params.useImages); } /* Use of image based buffers is deprecated if (base->useImages()) { if (base->addScratchImages()) { std::cerr << "FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" << std::endl; } } */ ret = RUN_ALL_TESTS(); if (base->useImages()) { base->removeScratchImages(); } return ret; } clblas-2.10/src/tests/gtest.cmake000066400000000000000000000055371264277366700170130ustar00rootroot00000000000000 option(USE_SYSTEM_GTEST "Use system installed gtest when set to ON, or build gtest locally when set to OFF" OFF) if(USE_SYSTEM_GTEST) if( (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 2.8) ) message( STATUS "Cmake version 2.8 or greater needed to use GTest" ) else() # This will define GTEST_FOUND find_package( GTest ) endif() else() if(CMAKE_VERSION VERSION_LESS 3.2 AND CMAKE_GENERATOR MATCHES "Ninja") message(WARNING "Building GTest with Ninja has known issues with CMake older than 3.2") endif() include(ExternalProject) set(GTEST_LIBRARIES gtest gtest_main) # the binary dir must be know before creating the external project in order # to pass the byproducts set(prefix "${CMAKE_CURRENT_BINARY_DIR}/gtest-external-prefix") set(binary_dir "${prefix}/src/gtest-external-build") set(byproducts) foreach(lib ${GTEST_LIBRARIES}) set(${lib}_location ${binary_dir}/${CMAKE_CFG_INTDIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${lib}${CMAKE_STATIC_LIBRARY_SUFFIX}) list(APPEND byproducts ${${lib}_location}) endforeach() if( MSVC ) if( MSVC_VERSION LESS 1800 ) set(EXTRA_FLAG "/D_VARIADIC_MAX=10 ") else() set(EXTRA_FLAG "") endif() else() set(EXTRA_FLAG "") endif() ExternalProject_Add( gtest-external URL https://github.com/google/googletest/archive/release-1.7.0.zip URL_MD5 ef5e700c8a0f3ee123e2e0209b8b4961 PREFIX ${prefix} BINARY_DIR ${binary_dir} CMAKE_CACHE_ARGS -DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER} -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} -DCMAKE_CXX_FLAGS_DEBUG:STRING=${EXTRA_FLAG}${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_CXX_FLAGS_MINSIZEREL:STRING=${EXTRA_FLAG}${CMAKE_CXX_FLAGS_MINSIZEREL} -DCMAKE_CXX_FLAGS_RELEASE:STRING=${EXTRA_FLAG}${CMAKE_CXX_FLAGS_RELEASE} -DCMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=${EXTRA_FLAG}${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER} -DCMAKE_C_FLAGS:STRING=${CMAKE_C_FLAGS} -DCMAKE_C_FLAGS_DEBUG:STRING=${CMAKE_C_FLAGS_DEBUG} -DCMAKE_C_FLAGS_MINSIZEREL:STRING=${CMAKE_C_FLAGS_MINSIZEREL} 
-DCMAKE_C_FLAGS_RELEASE:STRING=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_C_FLAGS_RELWITHDEBINFO:STRING=${CMAKE_C_FLAGS_RELWITHDEBINFO} -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} -Dgtest_force_shared_crt:BOOL=ON BUILD_BYPRODUCTS ${byproducts} INSTALL_COMMAND "") foreach(lib ${GTEST_LIBRARIES}) add_library(${lib} IMPORTED STATIC) add_dependencies(${lib} gtest-external) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION ${${lib}_location}) endforeach() ExternalProject_Get_Property(gtest-external source_dir) set(GTEST_INCLUDE_DIRS ${source_dir}/include) set(GTEST_FOUND ON) endif() # Hack to get googletest v1.6 to work with vs2012 if( MSVC11 ) add_definitions( "/D_VARIADIC_MAX=10" ) endif( ) clblas-2.10/src/tests/include/000077500000000000000000000000001264277366700162745ustar00rootroot00000000000000clblas-2.10/src/tests/include/BlasBase.h000066400000000000000000000146311264277366700201260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef BLASBASE_H_ #define BLASBASE_H_ #include #include #include #if _MSC_VER #pragma warning (disable:4127) #endif // check it is double precision error and return #define CHECK_DP_ERROR_AND_RETURN(err, funcName) \ do { \ clMath::BlasBase *base = clMath::BlasBase::getInstance(); \ \ if (err == CL_INVALID_DEVICE && !base->isDevSupportDoublePrecision()) { \ ::std::cerr << std::endl << ">> " << funcName << \ "() reported that this device doesn't support double " \ "precision floating point arithmetic. 
Test is skipped" << \ ::std::endl; \ SUCCEED(); \ \ return; \ } \ } while (0) \ #define DEFAULT_SEED 12345 #define MAX_COMMAND_QUEUES 10 namespace clMath { // This class is a singleton class BlasBase { private: cl_platform_id platform_; // used in all cases cl_device_id primaryDevice_; /* * used only in cases with MultipleQueues to cover problem distribution * among different devices, not only different queues belonging to the same * device */ cl_device_id additionalDevice_; cl_context context_; cl_command_queue commandQueues_[MAX_COMMAND_QUEUES]; bool useNumCommandQueues_; cl_uint numCommandQueues_; bool useAlpha_; bool useBeta_; ComplexLong alpha_; ComplexLong beta_; bool useSeed_; unsigned int seed_; bool useM_, useN_, useK_; size_t M_, N_, K_; bool useIncX_, useIncY_; int incX_, incY_; bool useImages_; cl_device_type devType_; const char* devName_; cl_ulong imageA_; cl_ulong imageB_; BlasBase(); ~BlasBase(); BlasBase(const BlasBase &); // intentionally undefined BlasBase & operator=(const BlasBase &); // intentionally undefined void SetUp(); void TearDown(); bool initialized(); cl_int getPlatforms(cl_platform_id** platforms, cl_int *error); cl_device_id getDevice(cl_device_type type, const char* name, cl_int *error); void printDevInfoStr(cl_device_info param, const char *paramName, int primAdd); public: static BlasBase* getInstance(); cl_context context() { return context_; }; cl_command_queue* commandQueues() const { return const_cast(commandQueues_); }; bool useNumCommandQueues() const { return useNumCommandQueues_; }; cl_uint numCommandQueues() const { return numCommandQueues_; }; void setNumCommandQueues(cl_uint numCommandQueues) { if (numCommandQueues <= MAX_COMMAND_QUEUES) { numCommandQueues_ = numCommandQueues; useNumCommandQueues_ = true; } } bool useAlpha() const { return useAlpha_; } ComplexLong alpha() const { return alpha_; } void setAlpha(ComplexLong alpha) { alpha_ = alpha; useAlpha_ = true; } bool useBeta() const { return useBeta_; } ComplexLong beta() const { return beta_; } void setBeta(ComplexLong beta) { beta_ = beta; useBeta_ = true; } bool useSeed() const { return useSeed_; }; unsigned int seed() const { return seed_; }; void setSeed(unsigned int seed) { seed_ = seed; useSeed_ = true; } bool useM() const { return useM_; }; size_t M() const { return M_; } void setM(size_t M) { M_ = M; useM_ = true; } bool useN() const { return useN_; }; size_t N() const { return N_; } void setN(size_t N) { N_ = N; useN_ = true; } bool useK() const { return useK_; }; size_t K() const { return K_; } void setK(size_t K) { K_ = K; useK_ = true; } bool useIncX() const { return useIncX_; }; int incX() const { return incX_; } void setIncX(int incX) { incX_ = incX; useIncX_ = true; } bool useIncY() const { return useIncY_; }; int incY() const { return incY_; } void setIncY(int incY) { incY_ = incY; useIncY_ = true; } bool useImages() const { return useImages_; }; void setUseImages(bool value) { useImages_ = value; } void setUseImages(int value) { useImages_ = (value != 0); } bool setDeviceType(cl_device_type* devType, const char* devName); cl_mem createEnqueueBuffer(const void *data, size_t matrSize, size_t off, cl_mem_flags mode); cl_mem readBuffer(void *ptr, size_t off, size_t size); clblasStatus addScratchImages(void); void removeScratchImages(void); size_t scratchImageWidth(void); size_t scratchImageHeight(void); cl_ulong maxMemAllocSize(void); cl_ulong availGlobalMemSize(int primAdd); bool isDevSupportDoublePrecision(void); // print information on environment the test run in void 
printEnvInfo(void); void release(void) { TearDown(); } }; } // namespace #endif // BLASBASE_H_ clblas-2.10/src/tests/include/ExtraTestSizes.h000066400000000000000000000155041264277366700214130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef EXTRATESTSIZES_H_ #define EXTRATESTSIZES_H_ #include //#define AMD_ETS_CONTAINER(ar1, ar2, ar3, ar4, ar5, ar6) namespace clMath { union BlasStride { size_t ld; /* matrix leading dimension */ int inc; /* increment between vector elements */ }; /* * Common convention: * If a field is zero at test specialization, it is assumed to be undefined. * In this case a test itself is responsible for assigning some value to it. */ struct ExtraTestSizes { ExtraTestSizes() : offA(0), offBX(0), offCY(0) { strideA.ld = 0; strideBX.ld = 0; strideCY.ld = 0; } ExtraTestSizes( size_t lda, int incx, int incy, size_t offA, size_t offBX, size_t offCY) { strideA.ld = lda; strideBX.ld = 0; strideBX.inc = incx; strideCY.ld = 0; strideCY.inc = incy; this->offA = offA; this->offBX = offBX; this->offCY = offCY; } ExtraTestSizes( size_t lda, size_t ldb, size_t ldc, size_t offA, size_t offBX, size_t offCY) { strideA.ld = lda; strideBX.ld = ldb; strideCY.ld = ldc; this->offA = offA; this->offBX = offBX; this->offCY = offCY; } BlasStride strideA; BlasStride strideBX; BlasStride strideCY; size_t offA; size_t offBX; size_t offCY; }; template class IteratorETS { public: typedef ExtraTestSizes value_type; typedef std::forward_iterator_tag iterator_category; typedef int difference_type; typedef ExtraTestSizes* pointer; typedef ExtraTestSizes& reference; IteratorETS( const size_t *begin1, const size_t *end1, const T2 *begin2, const T2 *end2, const T3 *begin3, const T3 *end3, const size_t *begin4, const size_t *end4, const size_t *begin5, const size_t *end5, const size_t *begin6, const size_t *end6, int startEnd) : begin1_(begin1), end1_(end1), begin2_(begin2), end2_(end2), begin3_(begin3), end3_(end3), begin4_(begin4), end4_(end4), begin5_(begin5), end5_(end5), begin6_(begin6), end6_(end6) { cur1_ = (startEnd) ? end1_ : begin1_; cur2_ = begin2_; cur3_ = begin3_; cur4_ = begin4_; cur5_ = begin5_; cur6_ = begin6_; } IteratorETS& operator++() { bool carry = false; // don't go beyond the end if (cur1_ == end1_) { return *this; } carry = (cur6_ + 1 == end6_); cur6_ = (carry) ? begin6_ : (cur6_ + 1); if (carry) { carry = (cur5_ + 1 == end5_); cur5_ = (carry) ? begin5_ : (cur5_ + 1); } if (carry) { carry = (cur4_ + 1 == end4_); cur4_ = (carry) ? begin4_ : (cur4_ + 1); } if (carry) { carry = (cur3_ + 1 == end3_); cur3_ = (carry) ? begin3_ : (cur3_ + 1); } if (carry) { carry = (cur2_ + 1 == end2_); cur2_ = (carry) ? 
begin2_ : (cur2_ + 1); } if (carry) { cur1_++; } return *this; } bool operator==(const IteratorETS& rhs) const { return (cur1_ == rhs.cur1_ && cur2_ == rhs.cur2_ && cur3_ == rhs.cur3_ && cur4_ == rhs.cur4_ && cur5_ == rhs.cur5_ && cur6_ == rhs.cur6_); } bool operator!=(const IteratorETS& rhs) const { return !(*this == rhs); } ExtraTestSizes& operator*() { inst_ = ExtraTestSizes(*cur1_, *cur2_, *cur3_, *cur4_, *cur5_, *cur6_); return inst_; } private: ExtraTestSizes inst_; const size_t *begin1_; const size_t *cur1_; const size_t *end1_; const T2 *begin2_; const T2 *cur2_; const T2 *end2_; const T3 *begin3_; const T3 *cur3_; const T3 *end3_; const size_t *begin4_; const size_t *cur4_; const size_t *end4_; const size_t *begin5_; const size_t *cur5_; const size_t *end5_; const size_t *begin6_; const size_t *cur6_; const size_t *end6_; }; /* * Extra test sizes container */ template class ContainerETS { public: typedef ExtraTestSizes value_type; ContainerETS( const size_t (&array1)[N1], const T2 (&array2)[N2], const T3 (&array3)[N3], const size_t (&array4)[N4], const size_t (&array5)[N5], const size_t (&array6)[N6]) : ar1_(array1), ar2_(array2), ar3_(array3), ar4_(array4), ar5_(array5), ar6_(array6) { } IteratorETS begin() const { return IteratorETS(ar1_, ar1_ + N1, ar2_, ar2_ + N2, ar3_, ar3_ + N3, ar4_, ar4_ + N4, ar5_, ar5_ + N5, ar6_, ar6_ + N6, 0); } IteratorETS end() const { return IteratorETS(ar1_, ar1_ + N1, ar2_, ar2_ + N2, ar3_, ar3_ + N3, ar4_, ar4_ + N4, ar5_, ar5_ + N5, ar6_, ar6_ + N6, 1); } private: const size_t *ar1_; const T2 *ar2_; const T3 *ar3_; const size_t *ar4_; const size_t *ar5_; const size_t *ar6_; }; template ContainerETS makeContainerETS( const size_t (&array1)[N1], const T2 (&array2)[N2], const T3 (&array3)[N3], const size_t (&array4)[N4], const size_t (&array5)[N5], const size_t (&array6)[N6]) { return ContainerETS(array1, array2, array3, array4, array5, array6); } } /* namespace clMath */ #endif /* EXTRATESTSIZES_H_ */ clblas-2.10/src/tests/include/asum.h000066400000000000000000000043661264277366700174230ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class ASUM : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx, should be greater than 0 int, //offx int, //offa -- for offAsum int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->offBX = offx; params->offa = offAsum; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { //size_t lenX; N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); offx = ::std::tr1::get<2>(GetParam()); offAsum = ::std::tr1::get<3>(GetParam()); numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx); ::std::cerr << "offAsum = " << offAsum << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; int incx; size_t offx, offAsum; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/axpy.h000066400000000000000000000050741264277366700174340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef AXPY__H_ #define AXPY__H_ #include #include #include #include using namespace clMath; using ::testing::TestWithParam; // Name AXPY creates problem in gTest class AXPY : public TestWithParam< ::std::tr1::tuple< int, // N ComplexLong, // alpha int, // offBX int, // incx, should not be 0 int, //offCY int, //incy, should not be 0 int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->alpha = paramAlpha; params->offBX = offBX; params->incx = incx; params->offCY = offCY; params->incy = incy; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); paramAlpha = ::std::tr1::get<1>(GetParam()); offBX = ::std::tr1::get<2>(GetParam()); incx = ::std::tr1::get<3>(GetParam()); offCY = ::std::tr1::get<4>(GetParam()); incy = ::std::tr1::get<5>(GetParam()); numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, paramAlpha, offBX, incx, offCY, incy); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; bool useAlpha; ComplexLong paramAlpha; size_t offBX; int incx; size_t offCY; int incy; unsigned int seed; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/blas-cblas.h000066400000000000000000000374721264277366700204650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef BLAS_CBLAS_H_ #define BLAS_CBLAS_H_ /* Under Windows math.h defines "complex" to mean "_complex". 
*/ #include #undef complex #ifdef __cplusplus extern "C" { #endif /* A complex datatype for use by the C interfaces to ACML routines */ #ifndef _ACML_COMPLEX #define _ACML_COMPLEX typedef struct { float real, imag; } complex; typedef struct { double real, imag; } doublecomplex; #endif /* !defined(_ACML_COMPLEX) */ /* Basic complex arithmetic routines for C */ complex compose_complex(float x, float y); float complex_real(complex z); float complex_imag(complex z); doublecomplex compose_doublecomplex(double x, double y); double doublecomplex_real(doublecomplex z); double doublecomplex_imag(doublecomplex z); /* BLAS-2 functions */ void sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy); void dgemv(char transa, int m, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy); void cgemv(char transa, int m, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy); void zgemv(char transa, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy); void ssymv(char uplo, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy); void dsymv(char uplo, int n, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy); void strmv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx); void dtrmv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx); void ctrmv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx); void ztrmv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx); void strsv(char uplo, char transa, char diag, int n, float *a, int lda, float *x, int incx); void dtrsv(char uplo, char transa, char diag, int n, double *a, int lda, double *x, int incx); void ctrsv(char uplo, char transa, char diag, int n, complex *a, int lda, complex *x, int incx); void ztrsv(char uplo, char transa, char diag, int n, doublecomplex *a, int lda, doublecomplex *x, int incx); void sger(int m, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda); void dger(int m, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda); void cgeru(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda); void zgeru(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda); void cgerc(int m, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda); void zgerc(int m, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda); void ssyr(char uplo, int n, float alpha, float *x, int incx, float *a, int lda); void dsyr(char uplo, int n, double alpha, double *x, int incx, double *a, int lda); void ssyr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a, int lda); void dsyr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a, int lda); void cher(char uplo, int n, float alpha, complex *x, int incx, complex *a, int lda); void zher(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *a, int lda); void cher2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a, int lda); void 
zher2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a, int lda); void chemv(char uplo, int n, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy); void zhemv(char uplo, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy); void stpmv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx); void dtpmv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx); void ctpmv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx); void ztpmv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx); void stpsv(char uplo, char transa, char diag, int n, float *ap, float *x, int incx); void dtpsv(char uplo, char transa, char diag, int n, double *ap, double *x, int incx); void ctpsv(char uplo, char transa, char diag, int n, complex *ap, complex *x, int incx); void ztpsv(char uplo, char transa, char diag, int n, doublecomplex *ap, doublecomplex *x, int incx); void sspr(char uplo, int n, float alpha, float *x, int incx, float *ap ); void dspr(char uplo, int n, double alpha, double *x, int incx, double *ap ); void sspmv(char uplo, int n, float alpha, float *ap, float *x, int incx, float beta, float *y, int incy); void dspmv(char uplo, int n, double alpha, double *ap, double *x, int incx, double beta, double *y, int incy); void chpmv(char uplo, int n, complex *alpha, complex *ap, complex *x, int incx, complex *beta, complex *y, int incy); void zhpmv(char uplo, int n, doublecomplex *alpha, doublecomplex *ap, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy); void chpr(char uplo, int n, float alpha, complex *x, int incx, complex *ap ); void zhpr(char uplo, int n, double alpha, doublecomplex *x, int incx, doublecomplex *ap ); void sspr2(char uplo, int n, float alpha, float *x, int incx, float *y, int incy, float *a ); void dspr2(char uplo, int n, double alpha, double *x, int incx, double *y, int incy, double *a ); void chpr2(char uplo, int n, complex *alpha, complex *x, int incx, complex *y, int incy, complex *a ); void zhpr2(char uplo, int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy, doublecomplex *a ); void sgbmv(char trans, int m, int n, int kl, int ku, float alpha, float *a, int inca, float *x, int incx, float beta, float *y, int incy ); void dgbmv(char trans, int m, int n, int kl, int ku, double alpha, double *a, int inca, double *x, int incx, double beta, double *y, int incy ); void cgbmv(char trans, int m, int n, int kl, int ku, complex *alpha, complex *a, int inca, complex *x, int incx, complex *beta, complex *y, int incy ); void zgbmv(char trans, int m, int n, int kl, int ku, doublecomplex *alpha, doublecomplex *a, int inca, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy ); void stbmv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx ); void dtbmv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx ); void ctbmv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx ); void ztbmv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx ); void ssbmv(char uplo, int n, int k, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy ); void dsbmv(char uplo, int n, 
int k, double alpha, double *a, int lda, double *x, int incx, double beta, double *y, int incy ); void chbmv(char uplo, int n, int k, complex *alpha, complex *a, int lda, complex *x, int incx, complex *beta, complex *y, int incy ); void zhbmv(char uplo, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *x, int incx, doublecomplex *beta, doublecomplex *y, int incy ); void stbsv(char uplo, char trans, char diag, int n, int k, float *a, int lda, float *x, int incx ); void dtbsv(char uplo, char trans, char diag, int n, int k, double *a, int lda, double *x, int incx ); void ctbsv(char uplo, char trans, char diag, int n, int k, complex *a, int lda, complex *x, int incx ); void ztbsv(char uplo, char trans, char diag, int n, int k, doublecomplex *a, int lda, doublecomplex *x, int incx ); /* BLAS-3 functions */ void sgemm(char transa, char transb, int m, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc); void dgemm(char transa, char transb, int m, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc); void cgemm(char transa, char transb, int m, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc); void zgemm(char transa, char transb, int m, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc); void strmm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb); void dtrmm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb); void ctrmm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb); void ztrmm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb); void strsm(char side, char uplo, char transa, char diag, int m, int n, float alpha, float *a, int lda, float *b, int ldb); void dtrsm(char side, char uplo, char transa, char diag, int m, int n, double alpha, double *a, int lda, double *b, int ldb); void ctrsm(char side, char uplo, char transa, char diag, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb); void ztrsm(char side, char uplo, char transa, char diag, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb); void ssyr2k(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc); void dsyr2k(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc); void csyr2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc); void zsyr2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc); void ssyrk(char uplo, char transa, int n, int k, float alpha, float *a, int lda, float beta, float *c, int ldc); void dsyrk(char uplo, char transa, int n, int k, double alpha, double *a, int lda, double beta, double *c, int ldc); void csyrk(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *beta, complex *c, int ldc); void zsyrk(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int 
lda, doublecomplex *beta, doublecomplex *c, int ldc); void ssymm(char side, char uplo, int m, int n, float alpha, float *a, int lda, float *b, int ldb, float beta, float *c, int ldc); void dsymm(char side, char uplo, int m, int n, double alpha, double *a, int lda, double *b, int ldb, double beta, double *c, int ldc); void csymm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc); void zsymm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc); void chemm(char side, char uplo, int m, int n, complex *alpha, complex *a, int lda, complex *b, int ldb, complex *beta, complex *c, int ldc); void zhemm(char side, char uplo, int m, int n, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, doublecomplex *beta, doublecomplex *c, int ldc); void cherk(char uplo, char transa, int n, int k, float alpha, complex *a, int lda, float beta, complex *c, int ldc); void zherk(char uplo, char transa, int n, int k, double alpha, doublecomplex *a, int lda, double beta, doublecomplex *c, int ldc); void cher2k(char uplo, char transa, int n, int k, complex *alpha, complex *a, int lda, complex *b, int ldb, float beta, complex *c, int ldc); void zher2k(char uplo, char transa, int n, int k, doublecomplex *alpha, doublecomplex *a, int lda, doublecomplex *b, int ldb, double beta, doublecomplex *c, int ldc); void sscal( int n, float alpha, float *x, int incx); void dscal( int n, double alpha, double *x, int incx); void cscal( int n, complex* alpha, complex *x, int incx); void zscal( int n, doublecomplex* alpha, doublecomplex *x, int incx); void csscal( int n, float alpha, complex *x, int incx); void zdscal( int n, double alpha, doublecomplex *x, int incx); void sswap( int n, float *x, int incx, float *y, int incy); void dswap( int n, double *x, int incx, double *y, int incy); void cswap( int n, complex *x, int incx, complex *y, int incy); void zswap( int n, doublecomplex *x, int incx, doublecomplex *y, int incy); void scopy( int n, float *x, int incx, float *y, int incy); void dcopy( int n, double *x, int incx, double *y, int incy); void ccopy( int n, complex *x, int incx, complex *y, int incy); void zcopy( int n, doublecomplex *x, int incx, doublecomplex *y, int incy); float sdot( int n, float *x, int incx, float *y, int incy); double ddot( int n, double *x, int incx, double *y, int incy); complex cdotu( int n, complex *x, int incx, complex *y, int incy); doublecomplex zdotu( int n, doublecomplex *x, int incx, doublecomplex *y, int incy); complex cdotc( int n, complex *x, int incx, complex *y, int incy); doublecomplex zdotc( int n, doublecomplex *x, int incx, doublecomplex *y, int incy); void saxpy( int n, float alpha, float *x, int incx, float *y, int incy); void daxpy( int n, double aplha, double *x, int incx, double *y, int incy); void caxpy( int n, complex *alpha, complex *x, int incx, complex *y, int incy); void zaxpy( int n, doublecomplex *alpha, doublecomplex *x, int incx, doublecomplex *y, int incy); void srotg(float *A, float *B, float *C, float *S); void drotg(double *A, double *B, double *C, double *S); void crotg(complex *A, complex *B, float *C, complex *S); void zrotg(doublecomplex *A, doublecomplex *B, double *C, doublecomplex *S); void srotmg(float *D1, float *D2, float *X1, const float *Y1, float *PARAM); void drotmg(double *D1, double *D2, double *X1, const double *Y1, double *PARAM); void srotm(int 
N, float *X, int incx, float *Y, int incy, float* PARAM); void drotm(int N, double *X, int incx, double *Y, int incy, double* PARAM); void srot(int N, float *X, int incx, float *Y, int incy, float C, float S); void drot(int N, double *X, int incx, double *Y, int incy, double C, double S); void csrot(int N, complex *X, int incx, complex *Y, int incy, float C, float S); void zdrot(int N, doublecomplex *X, int incx, doublecomplex *Y, int incy, double C, double S); float sasum(int n, float *x, int incx); double dasum(int n, double *x, int incx); float scasum(int n, complex *x, int incx); double dzasum(int n, doublecomplex *x, int incx); float snrm2( int n, float *x, int incx); double dnrm2( int n, double *x, int incx); float scnrm2( int n, complex *x, int incx); double dznrm2( int n, doublecomplex *x, int incx); int isamax(int n, float *x, int incx); int idamax(int n, double *x, int incx); int icamax(int n, complex *x, int incx); int izamax(int n, doublecomplex *x, int incx); #ifdef __cplusplus } #endif #endif /* BLAS_CBLAS_H_ */ clblas-2.10/src/tests/include/blas-internal.h000066400000000000000000001050541264277366700212050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef BLAS_INTERNAL_H_ #define BLAS_INTERNAL_H_ #ifdef __cplusplus extern "C" { #endif /* BLAS-2 functions */ void blasSgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy); void blasDgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy); void blasCgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *X, int incx, FloatComplex beta, FloatComplex *Y, int incy); void blasZgemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *X, int incx, DoubleComplex beta, DoubleComplex *Y, int incy); void blasSsymv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy); void blasDsymv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy); /* BLAS-3 functions */ void blasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc); void blasDgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc); void blasCgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc); void blasZgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc); void blasStrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb); void blasDtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb); void blasCtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb); void blasZtrmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb); void blasStrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb); void blasDtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb); void blasCtrsm( clblasOrder order, clblasSide side, 
clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb); void blasZtrsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb); void blasSsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc); void blasDsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc); void blasCsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc); void blasZsyr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc); void blasSsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, float beta, float *C, size_t ldc); void blasDsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, double beta, double *C, size_t ldc); void blasCsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex beta, FloatComplex *C, size_t ldc); void blasZsyrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex beta, DoubleComplex *C, size_t ldc); void blasStrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); void blasDtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); void blasCtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); void blasZtrmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); void blasStpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *AP, size_t offa, float *X, size_t offx, int incx); void blasDtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *AP, size_t offa, double *X, size_t offx, int incx); void blasCtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *AP, size_t offa, FloatComplex *X, size_t offx, int incx); void blasZtpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *AP, size_t offa, DoubleComplex *X, size_t offx, int incx); void blasStrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, 
float *X, size_t offx, int incx); void blasDtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); void blasCtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); void blasZtrsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); void blasStpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, float *X, size_t offx, int incx); void blasDtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, double *X, size_t offx, int incx); void blasCtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, FloatComplex *X, size_t offx, int incx); void blasZtpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, DoubleComplex *X, size_t offx, int incx); void blasSsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, float* A, size_t offa, size_t lda, float* B, size_t offb, size_t ldb, float beta, float* C, size_t offc, size_t ldc); void blasDsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, double* A, size_t offa, size_t lda, double* B, size_t offb, size_t ldb, double beta, double* C, size_t offc, size_t ldc); void blasCsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc); void blasZsymm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc); void blasSger( clblasOrder order, size_t M, size_t N, float alpha, float* x, size_t offx, int incx, float* y, size_t offy, int incy, float* A, size_t offa, size_t lda); void blasDger( clblasOrder order, size_t M, size_t N, double alpha, double* x, size_t offx, int incx, double* y, size_t offy, int incy, double* A, size_t offa, size_t lda); void blasCgeru( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda); void blasZgeru( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); void blasCgerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda); void blasZgerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); void blasCher( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* A, size_t offa, size_t lda); void blasZher( clblasOrder order, 
clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* A, size_t offa, size_t lda); void blasDsyr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* A, size_t offa, size_t lda); void blasSsyr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* A, size_t offa, size_t lda); void blasDspr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* AP, size_t offa); void blasSspr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* AP, size_t offa); void blasSsyr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* A, size_t offa, size_t lda); void blasDsyr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* A, size_t offa, size_t lda); //HER2 void blasCher2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda); void blasZher2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); void blasChemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy); void blasZhemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy); //HEMM void blasChemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc); void blasZhemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc); void blasCherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const FloatComplex *A, size_t lda, float beta, FloatComplex *C, size_t ldc); void blasZherk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const DoubleComplex *A, size_t lda, double beta, DoubleComplex *C, size_t ldc); void blasSspmv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t offa, const float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); void blasDspmv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t offa, const double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy); void blasChpmv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy); void blasZhpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, DoubleComplex* X, size_t offx, 
int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy); void blasChpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* AP, size_t offa); void blasZhpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* AP, size_t offa); void blasSspr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* AP, size_t offa); void blasDspr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* AP, size_t offa); void blasChpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* AP, size_t offa); void blasZhpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* AP, size_t offa); void blasSgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); void blasDgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy); void blasCgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy); void blasZgbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy); //TBMV void blasStbmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); void blasDtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); void blasCtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); void blasZtbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); void blasSsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); void blasDsbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy); void blasChbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy); void blasZhbmv( clblasOrder order, clblasUplo uplo, size_t N, 
size_t K, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy); //TBSV void blasStbsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); void blasDtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); void blasCtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); void blasZtbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); void blasCher2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t offa, size_t lda, const FloatComplex *B, size_t offb, size_t ldb, float beta, FloatComplex *C, size_t offc, size_t ldc); void blasZher2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t offa, size_t lda, const DoubleComplex *B, size_t offb, size_t ldb, double beta, DoubleComplex *C, size_t offc, size_t ldc); /* BLAS-1 functions */ //swap void blasSswap( size_t N, float *X, size_t offBX, int incx, float *Y, size_t offCY, int incy); void blasDswap( size_t N, double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy); void blasCswap( size_t N, FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy); void blasZswap( size_t N, DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy); //Scal void blasSscal( size_t N, float alpha, float *X, size_t offx, int incx); void blasDscal( size_t N, double alpha, double *X, size_t offx, int incx); void blasCscal( size_t N, FloatComplex alpha, FloatComplex *X, size_t offx, int incx); void blasZscal( size_t N, DoubleComplex alpha, DoubleComplex *X, size_t offx, int incx); void blasCsscal( size_t N, float alpha, FloatComplex *X, size_t offx, int incx); void blasZdscal( size_t N, double alpha, DoubleComplex *X, size_t offx, int incx); //COPY void blasScopy( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy); void blasDcopy( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy); void blasCcopy( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); void blasZcopy( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); // DOT float blasSdot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy); double blasDdot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy); FloatComplex blasCdotu( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); DoubleComplex blasZdotu( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); //ASUM float blasSasum( size_t N, float *X, size_t offx, int incx); double blasDasum( size_t N, double *X, size_t offx, int incx); float blasScasum( size_t N, FloatComplex *X, size_t offx, int incx); double blasDzasum( size_t N, DoubleComplex *X, size_t offx, int incx); //DOTC FloatComplex 
blasCdotc( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); DoubleComplex blasZdotc( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); //axpy void blasSaxpy( size_t N, float alpha, const float *X, size_t offBX, int incx, float *Y, size_t offCY, int incy); void blasDaxpy( size_t N, double alpha, const double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy); void blasCaxpy( size_t N, FloatComplex alpha, const FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy); void blasZaxpy( size_t N, DoubleComplex alpha, const DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy); //ROTG void blasSrotg( float* SA, size_t offSA, float* SB, size_t offSB, float* C, size_t offC, float* S, size_t offS); void blasDrotg( double* SA, size_t offSA, double* SB, size_t offSB, double* C, size_t offC, double* S, size_t offS); void blasCrotg( FloatComplex* SA, size_t offSA, FloatComplex* SB, size_t offSB, float* C, size_t offC, FloatComplex* S, size_t offS); void blasZrotg( DoubleComplex* SA, size_t offSA, DoubleComplex* SB, size_t offSB, double* C, size_t offC, DoubleComplex* S, size_t offS); void blasSrotmg( float *D1, size_t offD1, float *D2, size_t offD2, float *X1, size_t offX1, const float *Y1, size_t offY1, float *PARAM, size_t offParam); void blasDrotmg( double *D1, size_t offD1, double *D2, size_t offD2, double *X1, size_t offX1, const double *Y1, size_t offY1, double *PARAM, size_t offParam); void blasSrotm( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy, float *PARAM, size_t offParam); void blasDrotm( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double *PARAM, size_t offParam); void blasSrot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy, float C, float S); void blasDrot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy, double C, double S); void blasCsrot( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy, float C, float S); void blasZdrot( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy, double C, double S); int blasiSamax( size_t N, float *X, size_t offx, int incx); int blasiDamax( size_t N, double *X, size_t offx, int incx); int blasiCamax( size_t N, FloatComplex *X, size_t offx, int incx); int blasiZamax( size_t N, DoubleComplex *X, size_t offx, int incx); float blasSnrm2( size_t N, float *X, size_t offx, int incx); double blasDnrm2( size_t N, double *X, size_t offx, int incx); float blasScnrm2( size_t N, FloatComplex *X, size_t offx, int incx); double blasDznrm2( size_t N, DoubleComplex *X, size_t offx, int incx); #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* BLAS_INTERNAL_H_ */ clblas-2.10/src/tests/include/blas-math.h000066400000000000000000000163001264277366700203150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef BLAS_MATH_H_ #define BLAS_MATH_H_ #if defined (_MSC_VER) #if( _MSC_VER <= 1700 ) static unsigned long long ROW_NAN = 0x7ff0000000000000LL; #define NAN *(reinterpret_cast(&ROW_NAN)) #endif static unsigned int ROW_NANF = 0x7fc00000; #define NANF *(reinterpret_cast(&ROW_NANF)) #else /* _MSC_VER */ #define NANF NAN #endif /* !_MSC_VER */ #include // NAN, sqrt, abs #include // rand() #include #include static inline cl_int module(cl_int a) { return abs(a); } static inline cl_float module(cl_float a) { return fabsf(a); } static inline cl_double module(cl_double a) { return fabs(a); } static inline FloatComplex operator+(FloatComplex a, FloatComplex b) { return floatComplex(CREAL(a) + CREAL(b), CIMAG(b) + CIMAG(b)); } static inline FloatComplex operator-(FloatComplex a, FloatComplex b) { return floatComplex(CREAL(a) - CREAL(b), CIMAG(b) - CIMAG(b)); } static inline FloatComplex operator*(FloatComplex a, FloatComplex b) { return floatComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static inline FloatComplex operator*(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) * b, CIMAG(a) * b); } static inline FloatComplex operator/(FloatComplex a, FloatComplex b) { cl_float div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return floatComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static inline FloatComplex operator/(FloatComplex a, cl_float b) { return floatComplex(CREAL(a) / b, CIMAG(a) / b); } static inline cl_float module(FloatComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrtf(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } static inline DoubleComplex operator+(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) + CREAL(b), CIMAG(b) + CIMAG(b)); } static inline DoubleComplex operator-(DoubleComplex a, DoubleComplex b) { return doubleComplex(CREAL(a) - CREAL(b), CIMAG(b) - CIMAG(b)); } static inline DoubleComplex operator*(DoubleComplex a, DoubleComplex b) { return doubleComplex( CREAL(a) * CREAL(b) - CIMAG(a) * CIMAG(b), CREAL(a) * CIMAG(b) + CREAL(b) * CIMAG(a)); } static inline DoubleComplex operator*(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) * b, CIMAG(a) * b); } static inline DoubleComplex operator/(DoubleComplex a, DoubleComplex b) { cl_double div = CREAL(b) * CREAL(b) + CIMAG(b) * CIMAG(b); return doubleComplex( (CREAL(a) * CREAL(b) + CIMAG(a) * CIMAG(b)) / div, (CREAL(b) * CIMAG(a) - CREAL(a) * CIMAG(b)) / div); } static inline DoubleComplex operator/(DoubleComplex a, cl_double b) { return doubleComplex(CREAL(a) / b, CIMAG(a) / b); } static inline cl_double module(DoubleComplex a) { if ((CREAL(a) == 0.0) && (CIMAG(a) == 0.0)) return 0.0; return sqrt(CREAL(a) * CREAL(a) + CIMAG(a) * CIMAG(a)); } // Random generator template static T randomTrsv(cl_double limit) { T v; T temp; temp = ((T)rand() / (T)(RAND_MAX)); temp = temp * (T)limit; if(temp == 0) { if ((rand() % 2) == 1) { temp = ((T)rand() / (T)(RAND_MAX)); temp = temp * 
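/*
 * Usage sketch for the complex helpers defined above (values illustrative
 * only, directly following the operator definitions):
 *
 *     FloatComplex a = floatComplex(3.0f, 4.0f);
 *     FloatComplex b = floatComplex(1.0f, -2.0f);
 *     cl_float     m = module(a);   // 5.0f = sqrt(3*3 + 4*4)
 *     FloatComplex p = a * b;       // (3*1 - 4*(-2), 3*(-2) + 1*4) = (11, -2)
 *     FloatComplex q = p / b;       // divides by |b|^2, recovering ~a = (3, 4)
 *
 * module() is the magnitude used throughout the random generators below to
 * keep matrix and vector entries within safe bounds.
 */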
(T)limit; } } v = static_cast(temp); if ((rand() % 2) == 1) v = -v; return v; } template<> __template_static FloatComplex randomTrsv(cl_double limit) { return floatComplex(randomTrsv(limit), randomTrsv(limit)); } template<> __template_static DoubleComplex randomTrsv(cl_double limit) { return doubleComplex(randomTrsv(limit), randomTrsv(limit)); } template static T randomTrsv(cl_double left, cl_double right) { T v; T l = static_cast(left); v = randomTrsv(right - left); if (v < 0) { v -= l; } else { v += l; } return v; } template<> __template_static FloatComplex randomTrsv(cl_double left, cl_double right) { return floatComplex(randomTrsv(left, right), randomTrsv(left, right)); } template<> __template_static DoubleComplex randomTrsv(cl_double left, cl_double right) { return doubleComplex(randomTrsv(left, right), randomTrsv(left, right)); } template static T random(cl_double limit) { T v; cl_ulong l = static_cast(limit); if (l == 0) { return 0; } v = static_cast(rand() % l); if ((rand() % 2) == 1) v = -v; return v; } template<> __template_static FloatComplex random(cl_double limit) { return floatComplex(random(limit), random(limit)); } template<> __template_static DoubleComplex random(cl_double limit) { return doubleComplex(random(limit), random(limit)); } template static T random(cl_double left, cl_double right) { T v; T l = static_cast(left); v = random(right - left); if (v < 0) { v -= l; } else { v += l; } return v; } template<> __template_static FloatComplex random(cl_double left, cl_double right) { return floatComplex(random(left, right), random(left, right)); } template<> __template_static DoubleComplex random(cl_double left, cl_double right) { return doubleComplex(random(left, right), random(left, right)); } // Type-dependant constants template static T ZERO() { return static_cast(0.0); } template<> __template_static FloatComplex ZERO() { return floatComplex(0.0, 0.0); } template<> __template_static DoubleComplex ZERO() { return doubleComplex(0.0, 0.0); } template static T ONE() { return static_cast(1.0); } template<> __template_static FloatComplex ONE() { return floatComplex(1.0, 0.0); } template<> __template_static DoubleComplex ONE() { return doubleComplex(1.0, 0.0); } template static T FNAN(); template<> __template_static float FNAN() { return NANF; } template<> __template_static double FNAN() { return NAN; } template<> __template_static FloatComplex FNAN() { return floatComplex(NANF, NANF); } template<> __template_static DoubleComplex FNAN() { return doubleComplex(NAN, NAN); } #endif /* BLAS_MATH_H_ */ clblas-2.10/src/tests/include/blas-random.h000066400000000000000000000735151264277366700206570ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef BLAS_RANDOM_H_ #define BLAS_RANDOM_H_ #include #include // sqrt() #include #include #include #include template static void randomGemmxMatrices( clblasOrder order, clblasTranspose transA, clblasTranspose transB, clblasTranspose transC, size_t M, size_t N, size_t K, bool useAlpha, T *alpha, T *A, size_t lda, T *B, size_t ldb, bool useBeta, T *beta, T *C, size_t ldc) { size_t m, n, k; cl_double bound; if (!useAlpha) { *alpha = random(100); if (module(*alpha) == 0.0) { *alpha = ONE(); } } bound = UPPER_BOUND(); bound = sqrt(((K - 1) * bound) / (module(*alpha) * K * K)); for (m = 0; m < M; m++) { for (k = 0; k < K; k++) { setElement(order, transA, m, k, A, lda, random(bound)); } } if (B != NULL) { for (k = 0; k < K; k++) { for (n = 0; n < N; n++) { setElement(order, transB, k, n, B, ldb, random(bound)); } } } if ((!useBeta) && (beta != NULL)) { *beta = random(100); } if (C != NULL) { // if C is not NULL, then beta must not be NULL. bound = UPPER_BOUND(); if (module(*beta) != 0.0) { bound = sqrt(bound / (module(*beta) * K)); } for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { setElement(order, transC, m, n, C, ldc, random(bound)); } } } } template static void randomGemmMatrices( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, bool useAlpha, T *alpha, T *A, size_t lda, T *B, size_t ldb, bool useBeta, T *beta, T *C, size_t ldc) { randomGemmxMatrices(order, transA, transB, clblasNoTrans, M, N, K, useAlpha, alpha, A, lda, B, ldb, useBeta, beta, C, ldc); } template static void randomTrmmMatrices( clblasOrder order, clblasSide side, clblasUplo uplo, clblasDiag diag, size_t M, size_t N, bool useAlpha, T *alpha, T *A, size_t lda, T *B, size_t ldb) { size_t i, j; size_t limA = 0; /* Matrix A boundary: M or N */ switch (side) { case clblasLeft: randomGemmMatrices(order, clblasNoTrans, clblasNoTrans, M, N, M, useAlpha, alpha, A, lda, B, ldb, false, NULL, NULL, 0); limA = M; break; case clblasRight: randomGemmMatrices(order, clblasNoTrans, clblasNoTrans, M, N, N, useAlpha, alpha, B, ldb, A, lda, false, NULL, NULL, 0); limA = N; break; } // set to NAN elements which must not be accessed for (i = 0; i < limA; i++) { switch (uplo) { case clblasUpper: for (j = 0; j < i; j++) { setElement(order, clblasNoTrans, i, j, A, lda, FNAN()); } break; case clblasLower: for (j = i + 1; j < limA; j++) { setElement(order, clblasNoTrans, i, j, A, lda, FNAN()); } break; } } if (diag == clblasUnit) { for (i = 0; i < limA; i++) { setElement(order, clblasNoTrans, i, i, A, lda, FNAN()); } } } template static void randomTrsmMatrices( clblasOrder order, clblasSide side, clblasUplo uplo, clblasDiag diag, size_t M, size_t N, bool useAlpha, T *alpha, T *A, size_t lda, T *B, size_t ldb) { size_t limA, i, j; T min, max, x, y; cl_double modMin, modMax, sum; min = ZERO(); max = ZERO(); if (side == clblasLeft) { limA = M; } else { limA = N; } /* * Generate max(|a_{ii}|). Determine min(|a_{ii}|). * Generate a_{ii} which are constrainted by min/max. 
*/ switch (diag) { case clblasUnit: for (i = 0; i < limA; i++) { // must not be accessed setElement(order, clblasNoTrans, i, i, A, lda, ONE()); } break; case clblasNonUnit: /* Do not allow zeros on A's main diagonal */ do { max = random(TRSM_LIMIT_A()); } while (module(max) < 1); modMax = module(max); min = max / 100; modMin = module(min); setElement(order, clblasNoTrans, 0, 0, A, lda, max); for (i = 1; i < limA; i++) { x = random(modMin, modMax); if (module(x) == 0) { x = max; } setElement(order, clblasNoTrans, i, i, A, lda, x); } break; } /* Generate a_{ij} for all j <> i. */ for (i = 0; i < limA; i++) { if (diag == clblasUnit) { sum = module(ONE()); } else { sum = module(getElement(order, clblasNoTrans, i, i, A, lda)); } for (j = 0; j < limA; j++) { if (j == i) { continue; } if (((uplo == clblasUpper) && (j > i)) || ((uplo == clblasLower) && (j < i))) { // useful element if (sum >= 1.0) { x = random(sum / sqrt((double)limA - j)); sum -= module(x); } else { x = ZERO(); } } else { // must not be accessed x = FNAN(); } setElement(order, clblasNoTrans, i, j, A, lda, x); } } /* Generate matrix B. */ switch (side) { case clblasLeft: for (j = 0; j < N; j++) { sum = TRSM_LIMIT_B(); for (i = 0; i < M; i++) { x = getElement(order, clblasNoTrans, i, i, A, lda); y = ZERO(); if (sum >= 0.0) { y = random(sum * module(x) / sqrt((double)M - i)); sum -= module(y) / module(x); } setElement(order, clblasNoTrans, i, j, B, ldb, y); if ((i == 0) && (j == 0)) { min = y; } else if (module(y) < module(min)) { min = y; } } } break; case clblasRight: for (i = 0; i < M; i++) { sum = TRSM_LIMIT_B(); for (j = 0; j < N; j++) { x = getElement(order, clblasNoTrans, j, j, A, lda); y = ZERO(); if (sum >= 0.0) { y = random(sum * module(x) / sqrt((double)N - j)); sum -= module(y) / module(x); } setElement(order, clblasNoTrans, i, j, B, ldb, y); if ((i == 0) && (j == 0)) { min = y; } else if (module(y) < module(min)) { min = y; } } } break; } if (diag == clblasUnit) { for (i = 0; i < limA; i++) { // must not be accessed setElement(order, clblasNoTrans, i, i, A, lda, FNAN()); } } /* Calculate alpha and adjust B accordingly */ if (!useAlpha) { *alpha = ONE(); } if (module(min) > module(*alpha)) { /* FIXME: What exactly next three lines do? */ *alpha = random(module(min) - 2); *alpha = *alpha + ONE(); *alpha = *alpha + ONE(); if (module(*alpha) < 1.0) { *alpha = ONE(); } } if (module(*alpha) != 1.0) { for (i = 0; i < M; i++) { for (j = 0; j < N; j++) { x = getElement(order, clblasNoTrans, i, j, B, ldb); x = x / *alpha; setElement(order, clblasNoTrans, i, j, B, ldb, x); } } } } template static void randomTrsvMatrices( clblasOrder order, clblasUplo uplo, clblasDiag diag, size_t N, T *A, size_t lda, T *X, int incx) { size_t i, j; T min, max, x, y; cl_double modMin, modMax, sum, maxDiag; min = ZERO(); max = ZERO(); incx = abs(incx); maxDiag = 1.0; cl_double bound; bound = (UPPER_BOUND()/(N)); switch (diag) { case clblasUnit: for (i = 0; i < N; i++) { // must not be accessed if(lda > 0) { setElement(order, clblasNoTrans, i, i, A, lda, ONE/*FNAN*/()); } else //Packed case { setElementPacked(order, clblasNoTrans, uplo, i, i, A, N, ONE/*FNAN*/()); } } break; case clblasNonUnit: /* Do not allow zeros on A's main diagonal and get a big number which is atleast greater than N/4*/ maxDiag = ((N/4) > bound) ? (bound/4) : (N/4); maxDiag = (1 > (maxDiag)) ? 
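/*
 * Rationale for the bounds used by this TRSV/TPSV generator: the triangular
 * solve evaluates x_i = (b_i - sum_{j != i} a_ij * x_j) / a_ii, so the
 * diagonal entries are forced to be comparatively large (at least maxDiag and
 * never near zero) while each off-diagonal entry is drawn below |a_ii| / N.
 * That keeps the accumulated off-diagonal contribution of a row below |a_ii|,
 * so the solve stays well conditioned and the host/device comparison is not
 * dominated by overflow or cancellation.
 */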
1 : maxDiag; do { max = randomTrsv(bound); } while ((module(max) < (maxDiag))); modMax = module(max); min = max / 100; modMin = module(min); if(lda > 0) { setElement(order, clblasNoTrans, 0, 0, A, lda, max); } else //Packed Case { setElementPacked(order, clblasNoTrans, uplo, 0, 0, A, N, max); } //printf("Diagonals %d ", max); for (i = 1; i < N; i++) { x = randomTrsv(modMin, modMax); if (module(x) < 1) { x = max; } //printf("%d ", x); /*if(module(x) < 1) { printf("WARNING: Diagonal less than one\n"); }*/ if(lda > 0) { setElement(order, clblasNoTrans, i, i, A, lda, x); } else { setElementPacked(order, clblasNoTrans, uplo, i, i, A, N, x); } } // printf("\n"); break; } /* Generate a_{ij} for all j <> i. */ for (i = 0; i < N; i++) { if (diag == clblasUnit) { sum = module(ONE()); } else { T temp; if(lda > 0) { temp = getElement(order, clblasNoTrans, i, i, A, lda); } else { temp = getElementPacked(order, clblasNoTrans, uplo, i, i, A, N); } sum = module(temp); } for (j = 0; j < N; j++) { if (j == i) { continue; } if (((uplo == clblasUpper) && (j > i)) || ((uplo == clblasLower) && (j < i))) { x = randomTrsv(sum/N); } else { // must not be accessed x = FNAN(); } if(lda > 0) { setElement(order, clblasNoTrans, i, j, A, lda, x); } else //Packed Case. { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, x); } } } /* Generate matrix X. */ sum = TRSM_LIMIT_B(); for (i = 0; i < N; i++) { if(lda > 0) { x = getElement(order, clblasNoTrans, i, i, A, lda); } else //Packed Case. { x = getElementPacked(order, clblasNoTrans, uplo, i, i, A, N); } sum = module(x); y = randomTrsv(sum/N); setElement(clblasColumnMajor, clblasNoTrans, (i * abs(incx)), 0, X, (1 + (N-1)*abs(incx)), y); if (i == 0) { min = y; } else if (module(y) < module(min)) { min = y; } } } template static void randomSyrMatrices( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, T *alpha, T *A, size_t lda, T *X, int incx ) { size_t i, j; size_t lengthX; cl_double bound; if (!useAlpha) { *alpha = random(100); if (module(*alpha) == 0.0) { *alpha = ONE(); } } #ifdef DEBUG_SYR printf("ALPHA in randomSyrMatrices %f\n", *alpha); #endif // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); if(module(*alpha) > (sqrt(bound) / (2.0))) *alpha = random((sqrt(bound) / (2.0))); #ifdef DEBUG_SYR printf("ALPHA in randomSyrMatrices after check %f bound for alpha is %f\n", *alpha, (sqrt(bound) / (2.0))); #endif bound = bound / module(*alpha); bound = sqrt( ((((1.0) / module(*alpha)) / (4.0)) / module(*alpha)) + bound) - ((1.0) / ((2.0) * (*alpha))); #ifdef DEBUG_SYR printf("BOUND : %f alpha %f \n", bound, *alpha); #endif if( lda ) { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, random(bound)); } } } else { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } } template static void randomSyr2Matrices( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, T *alpha, T *A, size_t lda, T *X, int incx, T *Y, int incy ) { size_t i, j; size_t lengthX; size_t lengthY; cl_double bound; if (!useAlpha) { *alpha = random(100); if (module(*alpha) == 0.0) { *alpha = ONE(); } } #ifdef DEBUG_SYR2 printf("ALPHA in randomSyr2Matrices %f\n", *alpha); #endif // bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); 
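/*
 * Derivation of the element bound used here and in randomSyrMatrices above:
 * SYR updates A := alpha*x*x' + A and SYR2 updates A := alpha*(x*y' + y*x') + A,
 * so with every input bounded by x the result is bounded by alpha*x^2 + x
 * (SYR) or 2*alpha*x^2 + x (SYR2).  Requiring that to stay below UPPER_BOUND
 * and taking the positive root of the quadratic gives
 *
 *     SYR : x < sqrt( 1/(4*alpha^2)  + UPPER_BOUND/alpha     ) - 1/(2*alpha)
 *     SYR2: x < sqrt( 1/(16*alpha^2) + UPPER_BOUND/(2*alpha) ) - 1/(4*alpha)
 *
 * which matches the expression computed just below for SYR2 and the one in
 * randomSyrMatrices above for SYR.
 */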
if(module(*alpha) > (sqrt(bound) / (4.0))) *alpha = random((sqrt(bound) / (4.0))); #ifdef DEBUG_SYR2 printf("ALPHA in randomSyrMatrices after check %f bound for alpha is %f\n", *alpha, (sqrt(bound) / (2.0))); #endif bound = bound / ( 2 * module(*alpha)); bound = sqrt( ((((1.0) / module(*alpha)) / (16.0)) / module(*alpha)) + bound) - ((1.0) / ((4.0) * (*alpha))); #ifdef DEBUG_SYR2 printf("BOUND : %f alpha %f \n", bound, *alpha); #endif if( lda ) { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, random(bound)); } } } else { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } lengthY = 1 + (N - 1) * abs(incy); if (Y != NULL) { for (i = 0; i < lengthY; i++) { Y[i] = random(bound); } } } template static void randomHemvMatrices( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, T *alpha, T *A, size_t lda, T *X, int incx, bool useBeta, T *beta, T *Y, int incy ) { size_t i, j; size_t lengthX; size_t lengthY; cl_double bound; cl_double fAlpha, fBeta; if (!useAlpha) { *alpha = random(100); if (module(CREAL(*alpha)) == 0.0) { CREAL(*alpha) = 1.0; } } if (!useBeta) { *beta = random(100); if (module(CREAL(*beta)) == 0.0) { CREAL(*beta) = 1.0; } } #ifdef DEBUG_HEMV printf("ALPHA in randomSyr2Matrices %f.%f\n", CREAL(*alpha), CIMAG(*alpha)); printf("BETA in randomSyr2Matrices %f.%f\n", CREAL(*beta), CIMAG(*beta)); #endif // bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); if((module(CREAL(*alpha)) > bound) || (module(CIMAG(*alpha)) > bound)) *alpha = random((sqrt(bound) / ((2.0) * N))); if (module(CREAL(*alpha)) == 0.0) { CREAL(*alpha) = 1.0; } if((module(CREAL(*beta)) > bound) || (module(CIMAG(*beta)) > bound)) *beta = random((sqrt(bound))); if (module(CREAL(*beta)) == 0.0) { CREAL(*beta) = 1.0; } #ifdef DEBUG_HEMV printf("ALPHA in randomSyrMatrices after check %f.%f bound for alpha is %f\n", CREAL(*alpha), CIMAG(*alpha), (sqrt(bound) / ((2.0) * N))); #endif fAlpha = (module(CREAL(*alpha)) > module(CIMAG(*alpha))) ? module(CREAL(*alpha)) : module(CIMAG(*alpha)); fBeta = (module(CREAL(*beta)) > module(CIMAG(*beta))) ? module(CREAL(*beta)) : module(CIMAG(*beta)); bound = bound / (fAlpha * N); bound = sqrt( ((((((fBeta * fBeta)) / fAlpha) / (4.0)) / fAlpha) / (N * N)) + bound) - ((fBeta) / ((2.0) * (fAlpha) * N)); #ifdef DEBUG_HEMV printf("BOUND : %f \n", bound); #endif if( lda ) { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, random(bound)); } } } else { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } lengthY = 1 + (N - 1) * abs(incy); if (Y != NULL) { for (i = 0; i < lengthY; i++) { Y[i] = random(bound); } } } template static void randomVectors( size_t N, T *X, int incx, T *Y = NULL, int incy = 0, bool considerN=false ) { cl_double quotient = (considerN)? 
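/*
 * Why sqrt(...)/2 in randomVectors: the BLAS-1 routines exercised with these
 * vectors compute expressions of the form alpha*x + y (or sums of N such
 * products when considerN is set, hence quotient = N).  If every operand is
 * bounded by B = sqrt(UPPER_BOUND/quotient)/2, then |alpha*x| <= B*B
 * <= UPPER_BOUND/(4*quotient), and adding y keeps the result well inside
 * UPPER_BOUND, so the host/device comparison is not perturbed by overflow.
 */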
N: 1.0; cl_double bound = sqrt( UPPER_BOUND()/quotient ) / 2; // sqrt for the alpha factor and 2 for addition int length = 1 + ((N - 1) * abs(incx)); for(int i=0; i(bound); } if(Y != NULL) { length = 1 + ((N - 1) * abs(incy)); for(int i=0; i(bound); } } } // testDG template static void setElementWithRandomData(T *p, int vectorLength, cl_double bound) { for(int k=0; k(bound); } template static void setElementWithUnity(T *p, int vectorLength) { p[0] = (T)1.0; if ( vectorLength == 2) { p[1] = 0.0f; } } template static void setElementWithZero(T *p, int vectorLength) { for(int k=0; k static void setDiagonalUnityOrNonUnity(int unity, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags, cl_double bound) { if (creationFlags & PACKED_MATRIX) { // Rows = Cols for PACKED Matrix for(size_t i=0;i< rows;i++) { if (creationFlags & UPPER_HALF_ONLY) { (unity==1)? setElementWithUnity( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength): (unity == 0)? setElementWithZero( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength): setElementWithRandomData( ((creationFlags & ROW_MAJOR_ORDER))?RMUPacked(i,i):RMLPacked(i,i), vectorLength, bound); } else { (unity==1)? setElementWithUnity( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i), vectorLength): (unity==0)? setElementWithZero( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i), vectorLength): setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER)?RMLPacked(i,i):RMUPacked(i,i) , vectorLength, bound); } } } else { // Row Major - rows x lda // Col major - lda x cols size_t firstdimension; T *p; if (creationFlags & ROW_MAJOR_ORDER) { firstdimension = rows; } else { firstdimension = cols; } for(size_t i=0; i static void setTriangularMatrixWithRandomData(char uplo, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags, cl_double bound) { // Packed Matrix if (creationFlags & PACKED_MATRIX) { if (uplo == 'L') { for( size_t i=0; i < rows; i++) { for( size_t j=0; j < i; j++) // Don't touch diagonals { //setRandom( (flags & ROW_MAJOR) ? RMLPacked(i,j) : CMLPacked(i,j)); setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER) ? RMLPacked(i,j) : RMUPacked(j,i), vectorLength, bound); } } } else { for( size_t i=0; i < rows; i++) { for( size_t j=(i+1); j < cols; j++) // Don't touch diagonals { //printf("(i,j) -- (%d,%d) : Index : %d\n", i, j, ((i*((2*rows) + 1 - i))/2 + (j -i))); setElementWithRandomData( (creationFlags & ROW_MAJOR_ORDER) ? 
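/*
 * The RMUPacked/RMLPacked macros (and the setElementPacked/getElementPacked
 * helpers used by the generators above) address triangular matrices stored
 * without the unused half.  For reference, the conventional packed mapping for
 * an N x N matrix is, 0-based and assuming these macros follow the standard
 * layout:
 *
 *     row-major, upper:  A(i,j) -> AP[ i*(2*N + 1 - i)/2 + (j - i) ]  for j >= i
 *     row-major, lower:  A(i,j) -> AP[ i*(i + 1)/2 + j ]              for j <= i
 *
 * The column-major cases are obtained by swapping i and j between the two
 * formulas above.
 */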
RMUPacked(i,j) : RMLPacked(j,i), vectorLength, bound); } } } } else { // Row Major - rows x lda // Col major - lda x cols size_t firstdimension, seconddimension; T *p; if ((uplo != 'U') && (uplo != 'L')) { throw -1; } if (creationFlags & ROW_MAJOR_ORDER) { firstdimension = rows; seconddimension = cols; } else { firstdimension = cols; seconddimension = rows; if (uplo == 'U') { uplo = 'L'; } else { uplo = 'U'; } } for(size_t i=0; i static void doTriangleOperation(TRIANGLE_OPERATIONS op, T* data, size_t rows, size_t cols, size_t lda, int vectorLength, int creationFlags ) { size_t firstdimension, seconddimension; T *p1, *p2; size_t start, end; if (creationFlags & ROW_MAJOR_ORDER) { firstdimension = rows; seconddimension = cols; } else { firstdimension = cols; seconddimension = rows; } for(size_t i=0; i static void doPopulate(T* data, size_t rows, size_t cols, size_t lda, int vectorLength, cl_double bound, int creationFlags = 0) { bool triangularMatrix = ((creationFlags & LOWER_HALF_ONLY) || (creationFlags & UPPER_HALF_ONLY)); // Non-Square Matrix if( rows != cols) { // Row-Major if (creationFlags & ROW_MAJOR_ORDER) { for( size_t i=0; i < rows; i++) { for(size_t j=0; j < cols; j++) { T *p = (T *)data + i* lda*vectorLength + j*vectorLength; setElementWithRandomData(p, vectorLength , bound); if ( i == j) { if (creationFlags & UNIT_DIAGONAL) { setElementWithUnity(p, vectorLength); } else if (creationFlags & ZERO_DIAGONAL) { setElementWithZero(p, vectorLength); } } } } } else // Col-Major { for( size_t i=0; i < rows; i++) { for(size_t j=0; j < cols; j++) { T *p = (T *)data + j* lda*vectorLength + i*vectorLength; setElementWithRandomData(p, vectorLength, bound); if ( i == j) { if (creationFlags & UNIT_DIAGONAL) { setElementWithUnity(p, vectorLength); } else if (creationFlags & ZERO_DIAGONAL) { setElementWithZero(p, vectorLength); } } } } } } else if ( creationFlags & PACKED_MATRIX ) // SQUARE and PACKED { if (triangularMatrix) { if (creationFlags & UPPER_HALF_ONLY) setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound); if (creationFlags & LOWER_HALF_ONLY) { setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound); } } else { // FIXME: throw -1; } if (creationFlags & UNIT_DIAGONAL) { setDiagonalUnity(); } else if (creationFlags & ZERO_DIAGONAL) { setDiagonalZero(); } else { setDiagonalRandom(); } } else // SQUARE { if (triangularMatrix) { if (creationFlags & UPPER_HALF_ONLY) setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound); if (creationFlags & LOWER_HALF_ONLY) setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound); } else { setTriangularMatrixWithRandomData('L', data, rows, cols, lda, vectorLength, creationFlags, bound); if (creationFlags & SYMMETRIC_MATRIX) { doTriangleOperation(LTOU, data, rows, cols, lda, vectorLength, creationFlags); } else { setTriangularMatrixWithRandomData('U', data, rows, cols, lda, vectorLength, creationFlags, bound); } } if (creationFlags & UNIT_DIAGONAL) { setDiagonalUnity(); } else if (creationFlags & ZERO_DIAGONAL) { setDiagonalZero(); } else { setDiagonalRandom(); } } } template static void populate(T* data, size_t rows, size_t cols, size_t lda, BlasRoutineID BlasFn, int creationFlags = 0) { cl_double bound; bound = UPPER_BOUND(); cl_double biggest = (cl_double)std::max( rows, cols); switch( BlasFn ) { case CLBLAS_TRMV: bound = sqrt( ((biggest - 1)* bound) / (biggest * biggest)); break; case 
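/*
 * Bound selection rationale for populate(): with B = max(rows, cols),
 *  - TRMV-style products accumulate B terms of the form a_ij * x_j, so each
 *    factor is drawn below sqrt((B - 1) * UPPER_BOUND / B^2), keeping the
 *    accumulated sum below roughly UPPER_BOUND * (B - 1) / B;
 *  - routines of the alpha*X*Y form (SYMM/HEMM/HERK/HER/HER2/GER) multiply
 *    three bounded factors, hence the cube root of the same quantity.
 * The complex specializations below halve the TRMV bound again because each
 * component of a complex product is a sum of two real products.
 */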
CLBLAS_SYMM: case CLBLAS_HER: case CLBLAS_HER2: case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_GER: // Taking cube root because of Alpha factor- (alpha*X*Y) bound = pow( (((biggest - 1)* bound) / (biggest * biggest)), ((double)1/3) ); break; default : ::std::cerr << "Invalid function ID sent to populate!" << ::std::endl; } doPopulate( data, rows, cols, lda, 1, bound, creationFlags); } template<> __template_static void populate(FloatComplex* data, size_t rows, size_t cols, size_t lda, BlasRoutineID BlasFn, int creationFlags) { cl_double bound; bound = UPPER_BOUND(); cl_double biggest = (cl_double)std::max( rows, cols); switch( BlasFn ) { case CLBLAS_TRMV: bound = sqrt( ((biggest - 1)* bound) / (biggest * biggest)) / 2; break; case CLBLAS_SYMM: case CLBLAS_HER: case CLBLAS_HER2: case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_GER: // Taking cube root because of Alpha factor- (alpha*X*Y) bound = pow( (((biggest - 1)* bound) / (biggest * biggest)), ((double)1/3) ); break; default : ::std::cerr << "Invalid function ID sent to populate!" << ::std::endl; } doPopulate( (float*)data, rows, cols, lda, 2, bound, creationFlags); } template<> __template_static void populate(DoubleComplex* data, size_t rows, size_t cols, size_t lda, BlasRoutineID BlasFn, int creationFlags ) { cl_double bound; bound = UPPER_BOUND(); cl_double biggest = (cl_double)std::max( rows, cols); switch( BlasFn ) { case CLBLAS_TRMV: bound = sqrt( ((biggest - 1)* bound) / (biggest * biggest)) / 2; break; case CLBLAS_SYMM: case CLBLAS_HER: case CLBLAS_HER2: case CLBLAS_GER: case CLBLAS_HEMM: case CLBLAS_HERK: case CLBLAS_SYR: // Taking cube root because of Alpha factor- (alpha*X*Y) bound = pow( (((biggest - 1)* bound) / (biggest * biggest)), ((double)1/3) ); break; default : ::std::cerr << "Invalid function ID sent to populate!" << ::std::endl; } doPopulate( (double*)data, rows, cols, lda, 2, bound, creationFlags); } template static double maxVal( T elem ) { return (double)elem; } template <> __template_static double maxVal( FloatComplex elem ) { return (cl_double)std::max( CREAL(elem), CIMAG(elem) ); } template <> __template_static double maxVal( DoubleComplex elem ) { return (cl_double)std::max( CREAL(elem), CIMAG(elem) ); } #endif // BLAS_RANDOM_H_ clblas-2.10/src/tests/include/blas-wrapper.h000066400000000000000000001117071264277366700210530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef BLAS_WRAPPER_H_ #define BLAS_WRAPPER_H_ #include namespace clMath { class blas { public: // GEMV wrappers static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy); static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy); static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *X, int incx, FloatComplex beta, FloatComplex *Y, int incy); static void gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *X, int incx, DoubleComplex beta, DoubleComplex *Y, int incy); // SYMV wrappers static void symv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t lda, const float *X, int incx, float beta, float *Y, int incy); static void symv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t lda, const double *X, int incx, double beta, double *Y, int incy); // GEMM wrappers static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc); static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc); static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc); static void gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc); // TRMM wrappers static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb); static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, double *B, size_t ldb); static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb); static void trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb); // TRSM wrappers static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const float *A, size_t lda, float *B, size_t ldb); static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const double *A, size_t lda, 
double *B, size_t ldb); static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex *B, size_t ldb); static void trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex *B, size_t ldb); // SYR2K wrappers static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, const float *B, size_t ldb, float beta, float *C, size_t ldc); static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, const double *B, size_t ldb, double beta, double *C, size_t ldc); static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, const FloatComplex *B, size_t ldb, FloatComplex beta, FloatComplex *C, size_t ldc); static void syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, const DoubleComplex *B, size_t ldb, DoubleComplex beta, DoubleComplex *C, size_t ldc); // SYRK wrappers static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const float *A, size_t lda, float beta, float *C, size_t ldc); static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const double *A, size_t lda, double beta, double *C, size_t ldc); static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t lda, FloatComplex beta, FloatComplex *C, size_t ldc); static void syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t lda, DoubleComplex beta, DoubleComplex *C, size_t ldc); static void trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); static void trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); static void trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); static void trmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); //TPMV static void tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *AP, size_t offa, float *X, size_t offx, int incx); static void tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *AP, size_t offa, double *X, size_t offx, int incx); static void tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *AP, size_t offa, FloatComplex *X, size_t offx, int incx); static void tpmv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *AP, size_t offa, DoubleComplex *X, size_t 
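/*
 * Illustrative call into these host-side reference wrappers (sizes and values
 * are hypothetical; column-major, no transpose):
 *
 *     size_t M = 4, N = 3, K = 5;
 *     std::vector<float> A(M * K), B(K * N), C(M * N, 0.0f);
 *     // ... fill A and B with test data ...
 *     clMath::blas::gemm(clblasColumnMajor, clblasNoTrans, clblasNoTrans,
 *                        M, N, K, 1.0f, A.data(), M, B.data(), K,
 *                        0.0f, C.data(), M);
 *
 * The result in C can then be compared element-wise against the clBLAS device
 * output.
 */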
offx, int incx); static void trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); static void trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); static void trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); static void trsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); static void tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, float *A, size_t offa, float *X, size_t offx, int incx); static void tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, double *A, size_t offa, double *X, size_t offx, int incx); static void tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, FloatComplex *A, size_t offa, FloatComplex *X, size_t offx, int incx); static void tpsv( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, DoubleComplex *A, size_t offa, DoubleComplex *X, size_t offx, int incx); static void symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, float* A, size_t offa, size_t lda, float* B, size_t offb, size_t ldb, float beta, float* C, size_t offc, size_t ldc); static void symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, double* A, size_t offa, size_t lda, double* B, size_t offb, size_t ldb, double beta, double* C, size_t offc, size_t ldc); static void symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc); static void symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc); static void ger( clblasOrder order, size_t M, size_t N, float alpha, float* x, size_t offx, int incx, float* y, size_t offy, int incy, float* A , size_t offa, size_t lda); static void ger( clblasOrder order, size_t M, size_t N, double alpha, double* x, size_t offx, int incx, double* y, size_t offy, int incy, double* A, size_t offa, size_t lda); static void ger( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A , size_t offa, size_t lda); static void ger( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); static void gerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* y, size_t offy, int incy, FloatComplex* A , size_t offa, size_t lda); static void gerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); //HER 
wrappers static void her( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* A , size_t offa, size_t lda); static void her( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* A, size_t offa, size_t lda); // SYR wrappers static void syr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* A, size_t offa, size_t lda); static void syr( clblasOrder order, clblasUplo uplo, size_t N, double Alpha, double* X, size_t offx, int incx, double* A, size_t offa, size_t lda); //SPR static void spr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* AP, size_t offa); static void spr( clblasOrder order, clblasUplo uplo, size_t N, double Alpha, double* X, size_t offx, int incx, double* AP, size_t offa); static void syr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* A, size_t offa, size_t lda); static void syr2( clblasOrder order, clblasUplo uplo, size_t N, double Alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* A, size_t offa, size_t lda); //HER2 static void her2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* A, size_t offa, size_t lda); static void her2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* A, size_t offa, size_t lda); static void hemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy); static void hemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy); //HEMM static void hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, FloatComplex* A, size_t offa, size_t lda, FloatComplex* B, size_t offb, size_t ldb, FloatComplex beta, FloatComplex* C, size_t offc, size_t ldc); static void hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, size_t lda, DoubleComplex* B, size_t offb, size_t ldb, DoubleComplex beta, DoubleComplex* C, size_t offc, size_t ldc); // HERK wrappers static void herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const FloatComplex *A, size_t lda, float beta, FloatComplex *C, size_t ldc); static void herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const DoubleComplex *A, size_t lda, double beta, DoubleComplex *C, size_t ldc); // SPMV wrappers static void spmv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const float *A, size_t offa, const float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); static void spmv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const double *A, size_t offa, const double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy); static void hpmv( clblasOrder order, clblasUplo uplo, size_t N, 
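/*
 * Note on the Hermitian updates declared around here, following the usual
 * BLAS definitions: her()/hpr() perform A := alpha*x*x^H + A with a real
 * alpha, and her2()/hpr2() perform A := alpha*x*y^H + conj(alpha)*y*x^H + A.
 * In all cases only the triangle selected by uplo is referenced and updated,
 * and the diagonal of the result is real by construction.
 */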
FloatComplex alpha, FloatComplex* A, size_t offa, FloatComplex* X, size_t offx, int incx, FloatComplex beta, FloatComplex* Y, size_t offy, int incy); static void hpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* A, size_t offa, DoubleComplex* X, size_t offx, int incx, DoubleComplex beta, DoubleComplex* Y, size_t offy, int incy); //HPR wrappers static void hpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, FloatComplex* x, size_t offx, int incx, FloatComplex* AP , size_t offa); static void hpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, DoubleComplex* x, size_t offx, int incx, DoubleComplex* AP, size_t offa); static void spr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* AP, size_t offa); static void spr2( clblasOrder order, clblasUplo uplo, size_t N, double Alpha, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* AP, size_t offa); static void hpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex* AP, size_t offa); static void hpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex* AP, size_t offa); // GBMV wrappers static void gbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); static void gbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, double alpha, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx, double beta, double *Y, size_t offy, int incy); static void gbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy); static void gbmv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy); //TBMV static void tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); static void tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); static void tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); static void tbmv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); //SBMV static void sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, float alpha, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx, float beta, float *Y, size_t offy, int incy); static void sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, double alpha, double *A, size_t offa, size_t lda, double *X, 
size_t offx, int incx, double beta, double *Y, size_t offy, int incy); //HBMV static void hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, FloatComplex alpha, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx, FloatComplex beta, FloatComplex *Y, size_t offy, int incy); static void hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, DoubleComplex alpha, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx, DoubleComplex beta, DoubleComplex *Y, size_t offy, int incy); //TBSV static void tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, float *A, size_t offa, size_t lda, float *X, size_t offx, int incx); static void tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, double *A, size_t offa, size_t lda, double *X, size_t offx, int incx); static void tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, FloatComplex *A, size_t offa, size_t lda, FloatComplex *X, size_t offx, int incx); static void tbsv( clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, DoubleComplex *A, size_t offa, size_t lda, DoubleComplex *X, size_t offx, int incx); static void her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const FloatComplex *A, size_t offa, size_t lda, const FloatComplex *B, size_t offb, size_t ldb, float beta, FloatComplex *C, size_t offc, size_t ldc); static void her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const DoubleComplex *A, size_t offa, size_t lda, const DoubleComplex *B, size_t offb, size_t ldb, double beta, DoubleComplex *C, size_t offc, size_t ldc); //copy static void copy( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy); static void copy( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy); static void copy( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); static void copy( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); //DOT static float dot( size_t N, float *X, size_t offx, int incx, float *Y, size_t offy, int incy); static double dot( size_t N, double *X, size_t offx, int incx, double *Y, size_t offy, int incy); static FloatComplex dot( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); static DoubleComplex dot( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); //ASUM static float asum( size_t N, float *X, size_t offx, int incx); static double asum( size_t N, double *X, size_t offx, int incx); static float asum( size_t N, FloatComplex *X, size_t offx, int incx); static double asum( size_t N, DoubleComplex *X, size_t offx, int incx); static FloatComplex dotc( size_t N, FloatComplex *X, size_t offx, int incx, FloatComplex *Y, size_t offy, int incy); static DoubleComplex dotc( size_t N, DoubleComplex *X, size_t offx, int incx, DoubleComplex *Y, size_t offy, int incy); // SWAP wrappers static void swap( size_t N, float *X, size_t offa, int incx, float *Y, size_t offb, int incy); static void swap( size_t N, double *X, size_t offa, int incx, double *Y, size_t offb, int incy); static void swap( size_t N, FloatComplex *X, size_t offa, int incx, FloatComplex *Y, size_t offb, int 
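/*
 * The rotation wrappers declared below follow the usual BLAS conventions:
 * rotg(a, b, c, s) overwrites a with r and produces c, s such that
 *
 *     [ c  s ] [ a ]   [ r ]
 *     [-s  c ] [ b ] = [ 0 ],
 *
 * rot() applies that plane rotation to each (x_i, y_i) pair, and the PARAM
 * array used by rotm()/rotmg() holds {flag, h11, h21, h12, h22}, describing
 * the modified (fast) Givens transform.
 */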
incy); static void swap( size_t N, DoubleComplex *X, size_t offa, int incx, DoubleComplex *Y, size_t offb, int incy); // Scal static void scal( bool is_css_zds, size_t N, float alpha, float *X, size_t offx, int incx); static void scal( bool is_css_zds, size_t N, double alpha, double *X, size_t offx, int incx); static void scal( bool is_css_zds, size_t N, FloatComplex alpha, FloatComplex *X, size_t offx, int incx); static void scal( bool is_css_zds, size_t N, DoubleComplex alpha, DoubleComplex *X, size_t offx, int incx); //axpy calls static void axpy( size_t N, float alpha, const float * X, size_t offBX, int incx, float *Y, size_t offCY, int incy); static void axpy( size_t N, double alpha, const double *X, size_t offBX, int incx, double *Y, size_t offCY, int incy); static void axpy( size_t N, FloatComplex alpha, const FloatComplex *X, size_t offBX, int incx, FloatComplex *Y, size_t offCY, int incy); static void axpy( size_t N, DoubleComplex alpha, const DoubleComplex *X, size_t offBX, int incx, DoubleComplex *Y, size_t offCY, int incy); static void rotmg( float* D1, size_t offD1, float* D2, size_t offD2, float* X1, size_t offX1, const float* Y1, size_t offY1, float* PARAM, size_t offParam); static void rotmg( double* D1, size_t offD1, double* D2, size_t offD2, double* X1, size_t offX1, const double* Y1, size_t offY1, double* PARAM, size_t offParam); static void rotm( size_t N, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float* PARAM, size_t offParam); static void rotm( size_t N, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double* PARAM, size_t offParam); static void rotg( float* SA, size_t offSA, float* SB, size_t offSB, float* C, size_t offC, float* S, size_t offS); static void rotg( double* SA, size_t offSA, double* SB, size_t offSB, double* C, size_t offC, double* S, size_t offS); static void rotg( FloatComplex* SA, size_t offSA, FloatComplex* SB, size_t offSB, float* C, size_t offC, FloatComplex* S, size_t offS); static void rotg( DoubleComplex* SA, size_t offSA, DoubleComplex* SB, size_t offSB, double* C, size_t offC, DoubleComplex* S, size_t offS); static void rot( size_t N, float* X, size_t offx, int incx, float* Y, size_t offy, int incy, float C, float S); static void rot( size_t N, double* X, size_t offx, int incx, double* Y, size_t offy, int incy, double C, double S); static void rot( size_t N, FloatComplex* X, size_t offx, int incx, FloatComplex* Y, size_t offy, int incy, FloatComplex C, FloatComplex S); static void rot( size_t N, DoubleComplex* X, size_t offx, int incx, DoubleComplex* Y, size_t offy, int incy, DoubleComplex C, DoubleComplex S); static int iamax( size_t N, float *X, size_t offx, int incx); static int iamax( size_t N, double *X, size_t offx, int incx); static int iamax( size_t N, FloatComplex *X, size_t offx, int incx); static int iamax( size_t N, DoubleComplex *X, size_t offx, int incx); static float nrm2( size_t N, float *X, size_t offx, int incx); static double nrm2( size_t N, double *X, size_t offx, int incx); static float nrm2( size_t N, FloatComplex *X, size_t offx, int incx); static double nrm2( size_t N, DoubleComplex *X, size_t offx, int incx); };// class blas } // namespace clMath; #endif // BLAS_WRAPPER_H_ clblas-2.10/src/tests/include/clBLAS-wrapper.h000066400000000000000000001436211264277366700211720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef CLBLAS_WRAPPER_H_ #define CLBLAS_WRAPPER_H_ #include #include namespace clMath { class clblas { public: // GEMV wrappers static clblasStatus gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemv( clblasOrder order, clblasTranspose transA, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SYMV wrappers static clblasStatus symv( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus symv( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem X, size_t offx, int incx, double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // GEMM wrappers static clblasStatus gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, 
const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gemm2( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // TRMM wrappers static clblasStatus trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trmm( clblasOrder order, clblasSide side, clblasUplo uplo, 
clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trmm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // TRSM wrappers static clblasStatus trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, float alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, double alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trsm( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, cl_mem B, size_t offB, size_t ldb, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SYR2K wrappers static clblasStatus syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr2k( clblasOrder order, clblasUplo uplo, clblasTranspose transAB, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, 
size_t offB, size_t ldb, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SYRK wrappers static clblasStatus syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, FloatComplex alpha, const cl_mem A, size_t offA, size_t lda, FloatComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syrk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, DoubleComplex alpha, const cl_mem A, size_t offA, size_t lda, DoubleComplex beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t ffa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus trsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus tpsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem A, size_t offa, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, 
const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, FloatComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus symm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, DoubleComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus ger( clblasOrder order, size_t M, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus ger( clblasOrder order, size_t M, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus ger( clblasOrder order, size_t M, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus ger( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gerc( clblasOrder order, size_t M, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gerc( clblasOrder order, size_t M, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus her( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event 
*events); static clblasStatus her( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus syr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //HER2 wrappers static clblasStatus her2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus her2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem A, size_t offa, size_t lda, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hemv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hemv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //HEMM static clblasStatus hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, FloatComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, FloatComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hemm( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, DoubleComplex alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, DoubleComplex beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // HERK wrappers static clblasStatus herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, float alpha, const cl_mem A, size_t offA, size_t lda, float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event 
*events); static clblasStatus herk( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, double alpha, const cl_mem A, size_t offA, size_t lda, double beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // TPMV wrappers static clblasStatus tpmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, const cl_mem AP, size_t offa, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SPMV wrappers static clblasStatus spmv( clblasOrder order, clblasUplo uplo, size_t N, cl_float alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus spmv( clblasOrder order, clblasUplo uplo, size_t N, cl_double alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // HPMV wrappers static clblasStatus hpmv( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, FloatComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hpmv( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem AP, size_t offa, const cl_mem X, size_t offx, int incx, DoubleComplex beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SPR wrappers static clblasStatus spr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus spr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // HPR wrappers static clblasStatus hpr( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hpr( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); // SPR2 wrappers static clblasStatus spr2( clblasOrder order, clblasUplo uplo, size_t N, float alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, 
cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus spr2( clblasOrder order, clblasUplo uplo, size_t N, double alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //HPR2 wrappers static clblasStatus hpr2( clblasOrder order, clblasUplo uplo, size_t N, FloatComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hpr2( clblasOrder order, clblasUplo uplo, size_t N, DoubleComplex alpha, const cl_mem X, size_t offx, int incx, const cl_mem Y, size_t offy, int incy, cl_mem AP, size_t offa, cl_uint numCommandQueues, cl_command_queue *commandQueue, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus gbmv( clblasOrder order, clblasTranspose trans, size_t M, size_t N, size_t KL, size_t KU, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus tbmv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //SBMV static clblasStatus sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus sbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double alpha, const cl_mem A, size_t 
offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //HBMV static clblasStatus hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_float2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus hbmv( clblasOrder order, clblasUplo uplo, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem X, size_t offx, int incx, cl_double2 beta, cl_mem Y, size_t offy, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //TBSV static clblasStatus tbsv( DataType type, clblasOrder order, clblasUplo uplo, clblasTranspose trans, clblasDiag diag, size_t N, size_t K, const cl_mem A, size_t offa, size_t lda, cl_mem X, size_t offx, int incx, //cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_float2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_float beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus her2k( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, cl_double2 alpha, const cl_mem A, size_t offa, size_t lda, const cl_mem B, size_t offb, size_t ldb, cl_double beta, cl_mem C, size_t offc, size_t ldc, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus scal( bool is_css_zds, size_t N, cl_float alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus scal( bool is_css_zds, size_t N, cl_double alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus scal( bool is_css_zds, size_t N, FloatComplex alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus scal( bool is_css_zds, size_t N, DoubleComplex alpha, cl_mem X, size_t offx, int incx, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //swap calls static clblasStatus swap( DataType type, size_t N, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //copy static clblasStatus copy( DataType type, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, //cl_mem scratchBuff, cl_uint 
numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //DOT static clblasStatus dot( DataType type, size_t N, cl_mem dotProduct, size_t offDP, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //ASUM static clblasStatus asum( DataType type, size_t N, cl_mem asum, size_t offAsum, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //DOTC static clblasStatus dotc( DataType type, size_t N, cl_mem dotProduct, size_t offDP, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //axpy calls static clblasStatus axpy( size_t N, cl_float alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus axpy( size_t N, cl_double alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus axpy( size_t N, FloatComplex alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus axpy( size_t N, DoubleComplex alpha, cl_mem X, size_t offBX, int incx, cl_mem Y, size_t offCY, int incy, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rotmg( DataType type, cl_mem D1, size_t offD1, cl_mem D2, size_t offD2, cl_mem X1, size_t offX1, cl_mem Y1, size_t offY1, cl_mem PARAM, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rotm( DataType type, size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_mem PARAM, size_t offParam, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rotg( DataType type, cl_mem SA, size_t offSA, cl_mem SB, size_t offSB, cl_mem C, size_t offC, cl_mem S, size_t offS, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_float C, cl_float S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, cl_double C, cl_double S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rot( size_t N, cl_mem X, size_t offx, int 
incx, cl_mem Y, size_t offy, int incy, FloatComplex C, FloatComplex S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus rot( size_t N, cl_mem X, size_t offx, int incx, cl_mem Y, size_t offy, int incy, DoubleComplex C, DoubleComplex S, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); //AMAX static clblasStatus iamax( DataType type, size_t N, cl_mem iMax, size_t offiMax, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); static clblasStatus nrm2( DataType type, size_t N, cl_mem NRM2, size_t offNRM2, cl_mem X, size_t offx, int incx, cl_mem scratchBuff, cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_uint numEventsInWaitList, const cl_event *eventWaitList, cl_event *events); }; // clblas class } // namespace clMath #endif // CLBLAS_WRAPPER_H_ clblas-2.10/src/tests/include/cmdline.h000066400000000000000000000046731264277366700200720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
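// A minimal sketch of driving the clMath::clblas wrapper declared above, which forwards
// to the asynchronous clBLAS entry points on cl_mem buffers. It assumes an already
// initialized OpenCL context/queue and buffers of N floats; the helper name and error
// handling are illustrative only, and the header name comes from the archive path.
#include <CL/cl.h>
#include "clBLAS-wrapper.h"

// illustrative helper, not part of the test suite
static clblasStatus deviceAxpyExample(cl_command_queue queue, cl_mem bufX, cl_mem bufY, size_t N)
{
    cl_event event = NULL;

    // Enqueue Y = 2 * X + Y on the device through the wrapper: one queue, no wait list.
    clblasStatus status = clMath::clblas::axpy(N, 2.0f, bufX, 0, 1, bufY, 0, 1,
                                               1, &queue, 0, NULL, &event);
    if (status == clblasSuccess) {
        clWaitForEvents(1, &event);   // block until the enqueued work has finished
        clReleaseEvent(event);
    }
    return status;
}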
* ************************************************************************/ #ifndef CMDLINE_H_ #define CMDLINE_H_ #include #ifdef __cplusplus extern "C" { #endif typedef struct ComplexLong { long re; long imag; } ComplexLong; // flags showing wheter an option was set through the command line typedef enum SetoptFlags { NO_FLAGS = 0, SET_SEED = (1 << 0), SET_ALPHA = (1 << 1), SET_BETA = (1 << 2), SET_M = (1 << 3), SET_N = (1 << 4), SET_K = (1 << 5), SET_USE_IMAGES = (1 << 6), SET_DEVICE_TYPE = (1 << 7), SET_INCX = (1 << 8), SET_INCY = (1 << 9), SET_NUM_COMMAND_QUEUES = (1 << 10) } SetoptFlags; typedef struct TestParams { clblasOrder order; clblasTranspose transA; clblasTranspose transB; clblasTranspose transC; size_t M; size_t N; size_t K; size_t KL; size_t KU; int incx; int incy; size_t offA; size_t offBX; size_t offCY; size_t rowsA; size_t columnsA; size_t rowsB; size_t columnsB; size_t rowsC; size_t columnsC; size_t offa; size_t offb; size_t offc; // reminded alpha value set through the command line ComplexLong alpha; size_t lda; size_t ldb; // reminded beta value set through the command line ComplexLong beta; size_t ldc; clblasSide side; clblasUplo uplo; clblasDiag diag; unsigned int seed; int useImages; cl_device_type devType; const char* devName; cl_uint numCommandQueues; SetoptFlags optFlags; } TestParams; int parseBlasCmdLineArgs( int argc, char *argv[], TestParams *params); void printUsage(const char *appName); void parseEnv(TestParams *params); #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* CMDLINE_H_ */ clblas-2.10/src/tests/include/common.h000066400000000000000000000254571264277366700177520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef COMMON_H_ #define COMMON_H_ #if defined (_MSC_VER) #define __template_static static #else /* _MSC_VER */ #define __template_static #endif /* !_MSC_VER */ #define MAX(a, b) ((a>b)? a: b) #include #include #include #ifdef __cplusplus extern "C" { #endif typedef enum BlasRoutineID { CLBLAS_GEMV, CLBLAS_SYMV, CLBLAS_GEMM, CLBLAS_GEMM2, CLBLAS_GEMM_TAIL, CLBLAS_TRMM, CLBLAS_TRSM, CLBLAS_SYRK, CLBLAS_SYR2K, CLBLAS_TRMV, CLBLAS_TPMV, CLBLAS_TRSV, CLBLAS_TRSV_GEMV, // Need a Kludge as current "gemv" don't support complex types CLBLAS_SYMM, CLBLAS_GER, CLBLAS_SYR, CLBLAS_HER, CLBLAS_HER2, CLBLAS_HEMM, CLBLAS_HERK, CLBLAS_SWAP, CLBLAS_COPY, CLBLAS_DOT, CLBLAS_SCAL, CLBLAS_AXPY, CLBLAS_ROTG, CLBLAS_ROTM, CLBLAS_ROT, CLBLAS_ROTMG, CLBLAS_NRM2, CLBLAS_ASUM, CLBLAS_iAMAX, /* ! 
Must be the last */ BLAS_FUNCTIONS_NUMBER } BlasRoutineID; typedef enum BlasFunction { FN_SGEMV, FN_DGEMV, FN_CGEMV, FN_ZGEMV, FN_SSYMV, FN_DSYMV, FN_SSPMV, FN_DSPMV, FN_SGEMM, FN_DGEMM, FN_CGEMM, FN_ZGEMM, FN_SGEMM_2, FN_DGEMM_2, FN_CGEMM_2, FN_ZGEMM_2, FN_STRMM, FN_DTRMM, FN_CTRMM, FN_ZTRMM, FN_STRSM, FN_DTRSM, FN_CTRSM, FN_ZTRSM, FN_SSYR2K, FN_DSYR2K, FN_CSYR2K, FN_ZSYR2K, FN_SSYRK, FN_DSYRK, FN_CSYRK, FN_ZSYRK, FN_STRMV, FN_DTRMV, FN_CTRMV, FN_ZTRMV, FN_STPMV, FN_DTPMV, FN_CTPMV, FN_ZTPMV, FN_STRSV, FN_DTRSV, FN_CTRSV, FN_ZTRSV, FN_STPSV, FN_DTPSV, FN_CTPSV, FN_ZTPSV, FN_SSYMM, FN_DSYMM, FN_CSYMM, FN_ZSYMM, FN_SSYR, FN_DSYR, FN_SSPR, FN_DSPR, FN_SGER, FN_DGER, FN_CGERU, FN_ZGERU, FN_CGERC, FN_ZGERC, FN_CHER, FN_ZHER, FN_CHER2, FN_ZHER2, FN_CHPR, FN_ZHPR, FN_CHPR2, FN_ZHPR2, FN_SSYR2, FN_DSYR2, FN_SSPR2, FN_DSPR2, FN_CHEMV, FN_ZHEMV, FN_CHPMV, FN_ZHPMV, FN_CHEMM, FN_ZHEMM, FN_CHERK, FN_ZHERK, FN_SGBMV, FN_DGBMV, FN_CGBMV, FN_ZGBMV, FN_STBMV, FN_DTBMV, FN_CTBMV, FN_ZTBMV, FN_SSBMV, FN_DSBMV, FN_CHBMV, FN_ZHBMV, FN_STBSV, FN_DTBSV, FN_CTBSV, FN_ZTBSV, FN_CHER2K, FN_ZHER2K, FN_SCOPY, FN_DCOPY, FN_CCOPY, FN_ZCOPY, FN_SSWAP, FN_DSWAP, FN_CSWAP, FN_ZSWAP, FN_SDOT, FN_DDOT, FN_CDOTU, FN_ZDOTU, FN_CDOTC, FN_ZDOTC, FN_SSCAL, FN_DSCAL, FN_CSCAL, FN_ZSCAL, FN_CSSCAL, FN_ZDSCAL, FN_SAXPY, FN_DAXPY, FN_CAXPY, FN_ZAXPY, FN_SROTG, FN_DROTG, FN_CROTG, FN_ZROTG, FN_SROTM, FN_DROTM, FN_SROT, FN_DROT, FN_CSROT, FN_ZDROT, FN_SROTMG, FN_DROTMG, FN_SNRM2, FN_DNRM2, FN_SCNRM2, FN_DZNRM2, FN_SASUM, FN_DASUM, FN_SCASUM, FN_DZASUM, FN_iSAMAX, FN_iDAMAX, FN_iCAMAX, FN_iZAMAX, BLAS_FUNCTION_END } BlasFunctionID; cl_context getQueueContext(cl_command_queue commandQueue, cl_int *error); cl_int waitForSuccessfulFinish( cl_uint numCommandQueues, cl_command_queue *commandQueues, cl_event *events); cl_int flushAll( cl_uint numCommandQueues, cl_command_queue *commandQueues); const char* orderStr(clblasOrder order); const char* sideStr(clblasSide side); const char* uploStr(clblasUplo uplo); const char* transStr(clblasTranspose trans); const char* diagStr(clblasDiag diag); char encodeTranspose(clblasTranspose value); char encodeUplo(clblasUplo value); char encodeDiag(clblasDiag value); char encodeSide(clblasSide value); int functionBlasLevel(BlasFunctionID funct); size_t trsmBlockSize(void); #ifdef __cplusplus } // extern "C" #endif #ifdef __cplusplus template static T convertMultiplier(ComplexLong arg) { return static_cast(arg.re); } template<> __template_static FloatComplex convertMultiplier(ComplexLong arg) { return floatComplex( static_cast(arg.re), static_cast(arg.imag)); } template<> __template_static DoubleComplex convertMultiplier(ComplexLong arg) { return doubleComplex(arg.re, arg.imag); } template static cl_double returnMax(T arg) { return static_cast(fabs(arg)); } template<> __template_static cl_double returnMax (FloatComplex arg) { return static_cast( MAX( fabs(CREAL(arg)), fabs(CIMAG(arg)) ) ); } template<> __template_static cl_double returnMax (DoubleComplex arg) { return static_cast( MAX( fabs(CREAL(arg)), fabs(CIMAG(arg)) ) ); } // xGEMM void printTestParams( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, size_t offB, size_t ldb, bool useBeta, ComplexLong beta, size_t offC, size_t ldc); // xTRMM, xTRSM void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t M, size_t N, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, size_t 
offB, size_t ldb); //xTRMV, xTRSV void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t lda, int incx, size_t offa, size_t offx); //xTPMV void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, int incx, size_t offa, size_t offx); //xSYR xHER void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, double alpha, size_t offx, int incx, size_t offa, size_t lda); //xHER2 void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, cl_float2 alpha, size_t offx, int incx, size_t offy, int incy, size_t offa, size_t lda); //xCOPY , xSWAP void printTestParams( size_t N, size_t offx, int incx, size_t offy, int incy); //xSyr2 void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, double alpha, size_t offx, int incx, size_t offy, int incy, size_t offa, size_t lda); //HEMV void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, ComplexLong alpha, size_t offa, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy); //xSymm, void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, bool useAlpha, ComplexLong alpha, bool useBeta, ComplexLong beta, size_t lda, size_t ldb, size_t ldc, size_t offa, size_t offb, size_t offc ); //xHEMM void printTestParams( clblasOrder order, clblasSide side, clblasUplo uplo, size_t M, size_t N, bool useAlpha, cl_float2 alpha, bool useBeta, cl_float2 beta, size_t lda, size_t ldb, size_t ldc, size_t offa, size_t offb, size_t offc ); //xGER , xGERC void printTestParams( clblasOrder order, size_t M, size_t N, bool useAlpha, ComplexLong alpha, size_t lda, int incx, int incy, size_t offa, size_t offx, size_t offy ); // xGEMV void printTestParams( clblasOrder order, clblasTranspose transA, size_t M, size_t N, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, int incx, bool useBeta, ComplexLong beta, int incy); // xGBMV void printTestParams( clblasOrder order, clblasTranspose transA, size_t M, size_t N, size_t KL, size_t KU, ComplexLong alpha, size_t offA, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy); //xHBMV/xSBMV void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, size_t K, ComplexLong alpha, size_t offA, size_t lda, size_t offx, int incx, ComplexLong beta, size_t offy, int incy); // xTBMV void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, clblasDiag diag, size_t N, size_t KLU, size_t offA, size_t lda, size_t offx, int incx, size_t offy, int incy); // xSYMV void printTestParams( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, int incx, bool useBeta, ComplexLong beta, int incy); // xSYR2K void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, size_t offB, size_t ldb, bool useBeta, ComplexLong beta, size_t offC, size_t ldc); // xSYRK void printTestParams( clblasOrder order, clblasUplo uplo, clblasTranspose transA, size_t N, size_t K, bool useAlpha, ComplexLong alpha, size_t offA, size_t lda, bool useBeta, ComplexLong beta, size_t offC, size_t ldc); // xSCAL void printTestParams( size_t N, ComplexLong alpha, size_t offx, int incx); // xAXPY void printTestParams( size_t N, ComplexLong alpha, size_t offx, int incx, size_t offy, int incy); // For ROT void printTestParams( size_t N, size_t offx, int incx, size_t 
offy, int incy, ComplexLong alpha, ComplexLong beta); // xROTG, check if other ROTs can use this too void printTestParams(size_t offSA, size_t offSB, size_t offC, size_t offS); // xROTM void printTestParams(size_t N, size_t offx, int incx, size_t offy, int incy, size_t offParam, ComplexLong sflagParam); //xROTMG void printTestParams(int offX, int offY, int offD1, int offD2, int offParam, ComplexLong sflagParam); // xNRM2, AMAX and ASUM void printTestParams( size_t N, size_t offx, int incx); #endif // __cplusplus #endif /* COMMON_H_ */ clblas-2.10/src/tests/include/copy.h000066400000000000000000000044171264277366700174250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class COPY : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx, should be greater than 0 int, //incy int, //offx int, //offy int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->incy = incy; params->offBX = offx; params->offCY = offy; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); incy = ::std::tr1::get<2>(GetParam()); offx = ::std::tr1::get<3>(GetParam()); offy = ::std::tr1::get<4>(GetParam()); numCommandQueues = ::std::tr1::get<5>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx, offy, incy); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; int incx; int incy; size_t offx, offy; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/dot.h000066400000000000000000000047051264277366700172410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
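// A sketch of how a value-parameterized googletest case can be written against the
// COPY fixture declared above. Only the fixture's tuple layout
// (N, incx, incy, offx, offy, numCommandQueues) and the public getParams() come from
// copy.h; the test name, body, and parameter values below are illustrative assumptions,
// not the project's actual correctness tests.
#include "copy.h"

TEST_P(COPY, sketchOnly)   // hypothetical test case for illustration
{
    TestParams params;
    getParams(&params);    // copies N, incx, incy, offBX, offCY, numCommandQueues

    // ... allocate host and device vectors of length derived from params.N here,
    // run the clMath::blas::copy() reference and the clMath::clblas::copy() wrapper,
    // then compare the results ...
}

INSTANTIATE_TEST_CASE_P(sketchValues, COPY,
    ::testing::Combine(
        ::testing::Values(64, 4096),   // N
        ::testing::Values(1, 2),       // incx
        ::testing::Values(1, 2),       // incy
        ::testing::Values(0),          // offx
        ::testing::Values(0),          // offy
        ::testing::Values(1)));        // numCommandQueues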
* ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class DOT : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx, should be greater than 0 int, //incy int, //offx int, //offy int, //offa -- for offDP int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->incy = incy; params->offBX = offx; params->offCY = offy; params->offa = offDP; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { //size_t lenX; N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); incy = ::std::tr1::get<2>(GetParam()); offx = ::std::tr1::get<3>(GetParam()); offy = ::std::tr1::get<4>(GetParam()); offDP = ::std::tr1::get<5>(GetParam()); numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx, offy, incy); ::std::cerr << "offDP = " << offDP << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; int incx; int incy; size_t offx, offy, offDP; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/dotc.h000066400000000000000000000047061264277366700174050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
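// A sketch of the buffer layout implied by the DOT fixture above when it calls the
// clMath::clblas::dot wrapper: the scalar result is written into the dotProduct buffer
// at element offset offDP, and a separate scratch buffer holds partial sums. The
// TYPE_FLOAT tag, the assumption that N scratch elements suffice, and the helper name
// are all illustrative guesses, not taken verbatim from the sources.
#include <CL/cl.h>
#include "clBLAS-wrapper.h"

// illustrative helper, not part of the test suite
static clblasStatus enqueueDotExample(cl_context ctx, cl_command_queue queue,
                                      cl_mem bufX, cl_mem bufY, size_t N, size_t offDP)
{
    cl_int err;
    cl_event event = NULL;

    // Result buffer must reach at least element offDP; scratch size of N is an assumption.
    cl_mem bufDP   = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY,
                                    (offDP + 1) * sizeof(cl_float), NULL, &err);
    cl_mem scratch = clCreateBuffer(ctx, CL_MEM_READ_WRITE,
                                    N * sizeof(cl_float), NULL, &err);

    clblasStatus status = clMath::clblas::dot(TYPE_FLOAT, N, bufDP, offDP,
                                              bufX, 0, 1, bufY, 0, 1, scratch,
                                              1, &queue, 0, NULL, &event);
    if (status == clblasSuccess) {
        clWaitForEvents(1, &event);
        clReleaseEvent(event);
    }
    clReleaseMemObject(scratch);
    clReleaseMemObject(bufDP);
    return status;
}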
* ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class DOTC : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx, should be greater than 0 int, //incy int, //offx int, //offy int, //offa -- for offDP int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->incy = incy; params->offBX = offx; params->offCY = offy; params->offa = offDP; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { //size_t lenX; N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); incy = ::std::tr1::get<2>(GetParam()); offx = ::std::tr1::get<3>(GetParam()); offy = ::std::tr1::get<4>(GetParam()); offDP = ::std::tr1::get<5>(GetParam()); numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx, offy, incy); ::std::cerr << "offDP = " << offDP << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; int incx; int incy; size_t offx, offy, offDP; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/gbmv.h000066400000000000000000000117531264277366700174070ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
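// A short illustration of how the ComplexLong alpha/beta remembered in TestParams are
// turned into the typed multipliers the routines expect, via the convertMultiplier
// template and its FloatComplex/DoubleComplex specializations declared in common.h.
// The helper name and parameter values are made up for illustration.
#include "common.h"

static void convertMultiplierExample(const TestParams &params)   // illustrative helper
{
    // Real types keep only the real part of the command-line value.
    float        salpha = convertMultiplier<float>(params.alpha);
    double       dbeta  = convertMultiplier<double>(params.beta);

    // Complex types use both the .re and .imag fields of ComplexLong.
    FloatComplex calpha = convertMultiplier<FloatComplex>(params.alpha);

    (void)salpha; (void)dbeta; (void)calpha;
}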
* ************************************************************************/ #ifndef GBMV_H_ #define GBMV_H_ #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class GBMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasTranspose, // transA int, // M int, // N int, // KL int, // KU ExtraTestSizes, ComplexLong, // Alpha ComplexLong, // Beta int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->transA = transA; params->seed = seed; params->M = M; params->N = N; params->KL = KL; params->KU = KU; params->lda = lda; params->incx = incx; params->incy = incy; params->offA = offA; params->offa = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); transA = ::std::tr1::get<1>(GetParam()); M = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); KL = ::std::tr1::get<4>(GetParam()); KU = ::std::tr1::get<5>(GetParam()); extra = ::std::tr1::get<6>(GetParam()); offA = extra.offA; offx = extra.offBX; offy = extra.offCY; lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; paramAlpha = ::std::tr1::get<7>(GetParam()); paramBeta = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } KL = KL % M; KU = KU % N; lda = ::std::max(lda, (KL+KU+1)); printTestParams(order, transA, M, N, KL, KU, paramAlpha, offA, lda, offx, incx, paramBeta, offy, incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasTranspose transA; size_t M, N, KL, KU; size_t lda; int incx, incy; size_t offA, offx, offy; unsigned int seed; ComplexLong paramAlpha, paramBeta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; template static void randomGbmvMatrices( clblasOrder order, clblasTranspose trans, size_t M, size_t N, T *alpha, T *beta, T *A, size_t lda, T *X, int incx, T *Y, int incy ) { size_t i; size_t lenX, lenY, lenA; cl_double bound, maxAB, maxMN; // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); if(module(maxVal(*alpha)) > (sqrt(bound) / (2.0))) *alpha = random((sqrt(bound) / (2.0))); if(module(maxVal(*beta)) > (sqrt(bound) / (2.0))) *beta = random((sqrt(bound) / (2.0))); maxAB = module( ::std::max(maxVal(*alpha), maxVal(*beta)) ); // Take max of alpha & beta maxMN = (cl_double)::std::max( M, N ); bound = sqrt( bound / (maxAB*maxMN) ); // (maxAB * N * bound^2 + maxAB * bound - UPPER_BOUND) < 0 lenA = ((order == clblasRowMajor)? 
M: N) * lda; for (i = 0; i < lenA; i++) { A[i] = random(bound); } if( trans == clblasNoTrans ) { lenX = 1 + ((N - 1) * abs(incx)); lenY = 1 + ((M - 1) * abs(incy)); } else { lenX = 1 + ((M - 1) * abs(incx)); lenY = 1 + ((N - 1) * abs(incy)); } if (X != NULL) { for (i = 0; i < lenX; i++) { X[i] = random(bound); } } if (Y != NULL) { for (i = 0; i < lenY; i++) { Y[i] = random(bound); } } } #endif // GBMV_H_ clblas-2.10/src/tests/include/gemm-2.h000066400000000000000000000120211264277366700175250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef GEMM_2_H_ #define GEMM_2_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class GEMM2 : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasTranspose, // transA clblasTranspose, // transB int, // M int, // N int, // K ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->transA = transA; params->transB = transB; params->seed = seed; params->M = M; params->N = N; params->K = K; params->offA = offA; params->offBX = offB; params->offCY = offC; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->rowsC = rowsC; params->columnsC = columnsC; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); transA = ::std::tr1::get<1>(GetParam()); transB = ::std::tr1::get<2>(GetParam()); M = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); K = ::std::tr1::get<5>(GetParam()); extra = ::std::tr1::get<6>(GetParam()); offA = extra.offA; offB = extra.offBX; offC = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = base->K(); } if (transA == clblasNoTrans) { rowsA = M; columnsA = K; } else { rowsA = K; columnsA = M; } if (transB == clblasNoTrans) { rowsB = K; columnsB = N; } else { rowsB = N; columnsB = K; } rowsC = M; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case 
clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } printTestParams(order, transA, transB, M, N, K, useAlpha, base->alpha(), offA, lda, offB, ldb, useBeta, base->beta(), offC, ldc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasTranspose transA; clblasTranspose transB; size_t M, N, K; size_t offA, offB, offC; size_t lda, ldb, ldc; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsB, columnsB; size_t rowsC, columnsC; ::clMath::BlasBase *base; cl_ulong imageA, imageB; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // GEMM_2_H_ clblas-2.10/src/tests/include/gemm.h000066400000000000000000000120661264277366700173770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef GEMM_H_ #define GEMM_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class GEMM : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasTranspose, // transA clblasTranspose, // transB int, // M int, // N int, // K ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->transA = transA; params->transB = transB; params->seed = seed; params->M = M; params->N = N; params->K = K; params->offA = offA; params->offBX = offB; params->offCY = offC; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->rowsC = rowsC; params->columnsC = columnsC; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); transA = ::std::tr1::get<1>(GetParam()); transB = ::std::tr1::get<2>(GetParam()); M = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); K = ::std::tr1::get<5>(GetParam()); extra = ::std::tr1::get<6>(GetParam()); offA = extra.offA; offB = extra.offBX; offC = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = 
base->K(); } if (transA == clblasNoTrans) { rowsA = M; columnsA = K; } else { rowsA = K; columnsA = M; } if (transB == clblasNoTrans) { rowsB = K; columnsB = N; } else { rowsB = N; columnsB = K; } rowsC = M; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } ::std::cerr << " seed = " << seed << ", " << "queues = " << numCommandQueues << ", "; printTestParams(order, transA, transB, M, N, K, useAlpha, base->alpha(), offA, lda, offB, ldb, useBeta, base->beta(), offC, ldc); } clblasOrder order; clblasTranspose transA; clblasTranspose transB; size_t M, N, K; size_t offA, offB, offC; size_t lda, ldb, ldc; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsB, columnsB; size_t rowsC, columnsC; ::clMath::BlasBase *base; cl_ulong imageA, imageB; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // GEMM_H_ clblas-2.10/src/tests/include/gemv.h000066400000000000000000000164341264277366700174130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef GEMV_H_ #define GEMV_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class GEMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasTranspose, // transA int, // M int, // N ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->transA = transA; params->transB = transB; params->transC = transC; params->seed = seed; params->M = M; params->N = N; params->K = L; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->rowsB = rowsB; params->rowsC = rowsC; params->columnsA = columnsA; params->columnsB = columnsB; params->columnsC = columnsC; params->incx = incx; params->incy = incy; params->offA = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; size_t lenX, lenY; bool seqX, seqY; order = ::std::tr1::get<0>(GetParam()); transA = ::std::tr1::get<1>(GetParam()); M = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); extra = ::std::tr1::get<4>(GetParam()); offA = extra.offA; lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; numCommandQueues = ::std::tr1::get<5>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } if (base->useIncX()) { incx = base->incX(); } if (base->useIncY()) { incy = base->incY(); } ldb = ldc = 0; offx = offy = 0; L = (M + N) / 2; //It doesn't matter, can be any value seqX = module(incx) == 1; seqY = module(incy) == 1; if (transA == clblasNoTrans) { lenX = N; lenY = M; } else { lenX = M; lenY = N; } rowsA = M; columnsA = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; if (seqX) { //x is a middle row in row major matrix rowsB = L; columnsB = lenX; ldb = ::std::max(ldb, columnsB); transB = clblasTrans; offx = (rowsB / 2) * ldb; } else { //x is a middle column column in row major matrix rowsB = lenX; columnsB = L; ldb = ::std::max((size_t)module(incx), columnsB); transB = clblasNoTrans; offx = columnsB / 2; } columnsB = ldb; if (seqY) { //y is a middle row in row major matrix rowsC = L; columnsC = lenY; ldc = ::std::max(ldc, columnsC); transC = clblasTrans; offy = (rowsC / 2) * ldc; } else { //y is a middle column in row major matrix rowsC = lenY; columnsC = L; ldc = ::std::max((size_t)module(incy), columnsC); transC = clblasNoTrans; offy = columnsC / 2; } columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; if (seqX) { //x is a middle column in column major matrix rowsB = lenX; columnsB = L; ldb = ::std::max(ldb, rowsB); transB = clblasNoTrans; offx = (columnsB / 2) * ldb; } else { //x is a middle row in column major matrix rowsB = L; columnsB = lenX; ldb = ::std::max((size_t)module(incx), rowsB); transB = clblasTrans; offx = rowsB / 2; } rowsB = ldb; if (seqY) { //y is a middle column in column major matrix rowsC = lenY; columnsC = 
L; ldc = ::std::max(ldc, rowsC); transC = clblasNoTrans; offy = (columnsC / 2) * ldc; } else { //y is a middle row in column major matrix rowsC = L; columnsC = lenY; ldc = ::std::max((size_t)module(incy), rowsC); transC = clblasTrans; offy = rowsC / 2; } rowsC = ldc; break; } if (!seqX) { incx = incx > 0 ? (int)ldb : (int)(0-ldb); } if (!seqY) { incy = incy > 0 ? (int)ldc : (int)(0-ldc); } printTestParams(order, transA, M, N, useAlpha, base->alpha(), offA, lda, incx, useBeta, base->beta(), incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasTranspose transA, transB, transC; size_t M, N, L; size_t lda, ldb, ldc; int incx, incy; size_t offA, offx, offy; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, rowsB, rowsC, columnsA, columnsB, columnsC; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // GEMV_H_ clblas-2.10/src/tests/include/ger.h000066400000000000000000000071071264277366700172270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef GER_H_ #define GER_H_ #include #include #include #include using ::testing::TestWithParam; class GER : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order int, // M int, // N int, // lda int, //incx int, //incy int, // offx int, // offy int, // offa //FIX_ME.. 
gtest not allowing to add more parameters int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->M = M; params->N = N; params->lda = lda; params->incx = incx; params->incy = incy; params->offa = offa; params->offBX = offx; params->offCY = offy; params->rowsA = rowsA; params->alpha = paramAlpha; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); M = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); lda = ::std::tr1::get<3>(GetParam()); incx = ::std::tr1::get<4>(GetParam()); incy = ::std::tr1::get<5>(GetParam()); offa = ::std::tr1::get<6>(GetParam()); offx = ::std::tr1::get<7>(GetParam()); offy = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); ComplexLong fAlpha; fAlpha.re = 3, fAlpha.imag = 4; base->setAlpha(fAlpha); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } rowsA = M; columnsA = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); break; } printTestParams(order, M, N, useAlpha, base->alpha(), lda, incx, incy, offa, offx, offy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; size_t M, N; size_t lda; int incx, incy; size_t offa, offx, offy; unsigned int seed; bool useAlpha; ComplexLong paramAlpha; size_t rowsA, columnsA; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // GER_H_ clblas-2.10/src/tests/include/gerc.h000066400000000000000000000066461264277366700174010ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef GERC_H_ #define GERC_H_ #include #include #include #include using ::testing::TestWithParam; class GERC : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order int, // M int, // N int, // lda int, //incx int, //incy int, // offx int, // offy int, // offa //FIX_ME.. 
gtest not allowing to add more parameters int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->M = M; params->N = N; params->lda = lda; params->incx = incx; params->incy = incy; params->offa = offa; params->offBX = offx; params->offCY = offy; params->rowsA = rowsA; params->alpha = paramAlpha; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); M = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); lda = ::std::tr1::get<3>(GetParam()); incx = ::std::tr1::get<4>(GetParam()); incy = ::std::tr1::get<5>(GetParam()); offa = ::std::tr1::get<6>(GetParam()); offx = ::std::tr1::get<7>(GetParam()); offy = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); ComplexLong fAlpha; fAlpha.re = 3, fAlpha.imag = 4; base->setAlpha(fAlpha); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } rowsA = M; columnsA = N; if( lda == 0 ) lda = ::std::max(M, N); printTestParams(order, M, N, useAlpha, base->alpha(), lda, incx, incy, offa, offx, offy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; size_t M, N; size_t lda; int incx, incy; size_t offa, offx, offy; unsigned int seed; bool useAlpha; ComplexLong paramAlpha; size_t rowsA, columnsA; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // GERC_H_ clblas-2.10/src/tests/include/hbmv.h000066400000000000000000000066101264277366700174040ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef HBMV_H_ #define HBMV_H_ #include #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class HBMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N int, // K ExtraTestSizes, ComplexLong, // Alpha ComplexLong, // Beta int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->K = KLU; params->lda = lda; params->incx = incx; params->incy = incy; params->offA = offA; params->offa = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); KLU = ::std::tr1::get<3>(GetParam()); extra = ::std::tr1::get<4>(GetParam()); offA = extra.offA; offx = extra.offBX; offy = extra.offCY; lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; paramAlpha = ::std::tr1::get<5>(GetParam()); paramBeta = ::std::tr1::get<6>(GetParam()); numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); printTestParams(order, uplo, N, KLU, paramAlpha, offA, lda, offx, incx, paramBeta, offy, incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N, KLU; size_t lda; int incx, incy; size_t offA, offx, offy; unsigned int seed; ComplexLong paramAlpha, paramBeta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // HBMV_H_ clblas-2.10/src/tests/include/hemm.h000066400000000000000000000074411264277366700174010ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef HEMM_H_ #define HEMM_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class HEMM : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasSide, // side clblasUplo, // uplo int, // M int, // N cl_float2, //alpha cl_float2, //beta ExtraTestSizes, // to get more than ten parameters in gtest. 
int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->seed = seed; params->side = side; params->uplo = uplo; params->M = M; params->N = N; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->offA = offA; params->offBX = offb; params->offCY = offc; params->alpha.re = (long)CREAL(alpha); params->alpha.imag = (long)CIMAG(alpha); params->beta.re = (long)CREAL(beta); params->beta.imag = (long)CIMAG(beta); params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); side = ::std::tr1::get<1>(GetParam()); uplo = ::std::tr1::get<2>(GetParam()); M = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); alpha = ::std::tr1::get<5>(GetParam()); beta = ::std::tr1::get<6>(GetParam()); extra = ::std::tr1::get<7>(GetParam()); offA = extra.offA; offb = extra.offBX; offc = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } if( side == clblasLeft ) { lda = ::std::max(lda, M); } else { lda = ::std::max(lda, N); } switch (order) { case clblasRowMajor: ldb = ::std::max(ldb, N); ldc = ::std::max(ldc, N); break; case clblasColumnMajor: ldb = ::std::max(ldb, M); ldc = ::std::max(ldc, M); break; } printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offA, offb, offc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasSide side; clblasUplo uplo; size_t M, N; size_t lda, ldb, ldc; size_t offA, offb, offc; unsigned int seed; cl_float2 alpha, beta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // HEMM_H_ clblas-2.10/src/tests/include/hemv.h000066400000000000000000000077401264277366700174140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #if !defined(HEMV_PACKED) #ifndef HEMV_H #define HEMV_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; #ifndef HEMV_PACKED class HEMV : public TestWithParam< #else class HPMV : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N ComplexLong, // Alpha ComplexLong, // Beta size_t, // offA size_t, // offx size_t, // offy ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->lda = lda; params->incx = incx; params->incy = incy; params->offA = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); paramAlpha = ::std::tr1::get<3>(GetParam()); paramBeta = ::std::tr1::get<4>(GetParam()); offA = ::std::tr1::get<5>(GetParam()); offx = ::std::tr1::get<6>(GetParam()); offy = ::std::tr1::get<7>(GetParam()); extra = ::std::tr1::get<8>(GetParam()); lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; numCommandQueues = ::std::tr1::get<9>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); #ifndef HEMV_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useIncX()) { incx = base->incX(); } if (base->useIncY()) { incy = base->incY(); } printTestParams(order, uplo, N, paramAlpha, offA, lda, offx, incx, paramBeta, offy, incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx, incy; size_t offA, offx, offy; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; ::clMath::BlasBase *base; cl_ulong imageA, imageX, imageY; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // HEMV_H_ clblas-2.10/src/tests/include/her.h000066400000000000000000000106071264277366700172270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #if !defined(HER_PACKED) #ifndef HER_H #define HER_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include #include using ::testing::TestWithParam; #ifndef HER_PACKED class HER : public TestWithParam< #else class HPR : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N double, //alpha int, // lda int, //incx int, // offx int, // offa //FIX_ME.. gtest not allowing to add more parameters int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->N = N; params->alpha.re = (long)alpha; params->lda = lda; params->incx = incx; params->offa = offa; params->offBX = offx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); alpha = ::std::tr1::get<3>(GetParam()); lda = ::std::tr1::get<4>(GetParam()); incx = ::std::tr1::get<5>(GetParam()); offa = ::std::tr1::get<6>(GetParam()); offx = ::std::tr1::get<7>(GetParam()); numCommandQueues = ::std::tr1::get<8>(GetParam()); #ifndef HER_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(order, uplo, N, alpha, offx, incx, offa, lda ); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx; size_t offa, offx; unsigned int seed; double alpha; ComplexLong paramAlpha; size_t rowsA, columnsA; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #ifndef RANDOM_HER #define RANDOM_HER template static void randomHerMatrices( clblasOrder order, clblasUplo uplo, size_t N, T *alpha, T *A, size_t lda, T *X, int incx ) { size_t i, j; size_t lengthX; cl_double bound, max; // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); if(module(CREAL(*alpha)) > (sqrt(bound) / (2.0))) *alpha = random((sqrt(bound) / (2.0))); max = module(CREAL(*alpha)); bound = bound / max / 2.0; bound = sqrt( ((((1.0) / max) / (4.0)) / max) + bound) - ((1.0) / ((2.0) * max)); if( lda ) { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, random(bound)); } } } else { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } } #endif // RANDOM_HER #endif // HER_H_ clblas-2.10/src/tests/include/her2.h000066400000000000000000000122001264277366700173000ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #if !defined(HER2_PACKED) #ifndef HER2_H #define HER2_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include #include using ::testing::TestWithParam; #ifndef HER2_PACKED class HER2 : public TestWithParam< #else class HPR2 : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N cl_float2, //alpha int, // offx int, // incx, should be greater than 0 int, // offy, //int, // incy, should be greater than 0. // Since tuple doesnot allow more than 10 arguments we assume incy = incx; int, // offa int, // lda, 0 - undefined int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->alpha.re = (long)(CREAL(alpha)); // This will cast alpha to long. So the real value that is params->alpha.imag = (long)(CIMAG(alpha)); // passed is not the same as what is set in the test case params->offBX = offx; params->incx = incx; params->offCY = offy; params->incy = incy; params->offa = offa; params->lda = lda; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); alpha = ::std::tr1::get<3>(GetParam()); offx = ::std::tr1::get<4>(GetParam()); incx = ::std::tr1::get<5>(GetParam()); offy = ::std::tr1::get<6>(GetParam()); offa = ::std::tr1::get<7>(GetParam()); lda = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); incy = incx; //GTest allows only 10 arguments to be passed and //hence we define incy to be equivalent to incx. 
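        // Note (descriptive comment, based on the code below and on randomHer2Matrices()):
        // when this header is re-included via hpr2.h with HER2_PACKED defined, the
        // #else branch below forces lda to 0. randomHer2Matrices() keys off that value:
        // a non-zero lda fills A through setElement(), while lda == 0 selects the
        // packed setElementPacked() path.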
#ifndef HER2_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(order, uplo, N, 1, alpha, offx, incx, offy, incy, offa, lda); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx, incy; size_t offx, offy, offa; unsigned int seed; cl_float2 alpha; ::clMath::BlasBase *base; cl_ulong imageA, imageX, imageY; bool useNumCommandQueues; cl_uint numCommandQueues; }; #ifndef RANDOM_HER2 #define RANDOM_HER2 template static void randomHer2Matrices( clblasOrder order, clblasUplo uplo, size_t N, T *alpha, T *A, size_t lda, T *X, int incx, T *Y, int incy ) { size_t i, j; size_t lengthX; size_t lengthY; cl_double bound, max; // bound is calculated by solving the equation (2*alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); max = module( ::std::max( alpha->s[0], alpha->s[1] ) ); if(max > (sqrt(bound) / (4.0))) *alpha = random((sqrt(bound) / (4.0))); max = module( ::std::max( alpha->s[0], alpha->s[1] ) ); bound = bound / ( 2 * max); bound = sqrt( ((((1.0) / max) / (16.0)) / max) + bound) - ((1.0) / ((4.0) * max)); if( lda ) { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, random(bound)); } } } else { for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } lengthY = 1 + (N - 1) * abs(incy); if (Y != NULL) { for (i = 0; i < lengthY; i++) { Y[i] = random(bound); } } } #endif //RANDOM_HER2 #endif //HER2_H_ clblas-2.10/src/tests/include/her2k.h000066400000000000000000000113371264277366700174650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef HER2K_H_ #define HER2K_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class HER2K : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA int, // N int, // K ComplexLong, // alpha ComplexLong, // beta ExtraTestSizes, // offa, offb, offc, lda, ldb, ldc. 
int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->transA = transA; params->seed = seed; params->N = N; params->K = K; params->offA = offa; params->offa = offa; params->offBX = offB; params->offCY = offC; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->rowsC = rowsC; params->columnsC = columnsC; params->numCommandQueues = numCommandQueues; params->alpha = paramAlpha; params->beta = paramBeta; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); K = ::std::tr1::get<4>(GetParam()); paramAlpha = ::std::tr1::get<5>(GetParam()); paramBeta = ::std::tr1::get<6>(GetParam()); paramBeta.imag = 0; // Beta is a real number extra = ::std::tr1::get<7>(GetParam()); offa = extra.offA; offB = extra.offBX; offC = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = base->K(); } if (transA == clblasNoTrans) { rowsA = rowsB = N; columnsA = columnsB = K; } else { rowsA = rowsB = K; columnsA = columnsB = N; } rowsC = N; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } printTestParams(order, uplo, transA, N, K, true, paramAlpha, offa, lda, offB, ldb, true, paramBeta, offC, ldc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; size_t N, K; size_t offa, offC, offB; size_t lda, ldc, ldb; unsigned int seed; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsC, columnsC; size_t rowsB, columnsB; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // HER2K_H_ clblas-2.10/src/tests/include/herk.h000066400000000000000000000105771264277366700174100ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef HERK_H_ #define HERK_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class HERK : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA int, // N int, // K ComplexLong, // alpha ComplexLong, // beta ExtraTestSizes, // offa, offc, lda, ldc. int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->transA = transA; params->seed = seed; params->N = N; params->K = K; params->offA = offA; params->offCY = offC; params->lda = lda; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsC = rowsC; params->columnsC = columnsC; params->numCommandQueues = numCommandQueues; params->alpha = paramAlpha; params->beta = paramBeta; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); K = ::std::tr1::get<4>(GetParam()); paramAlpha = ::std::tr1::get<5>(GetParam()); paramBeta = ::std::tr1::get<6>(GetParam()); extra = ::std::tr1::get<7>(GetParam()); offA = extra.offA; offC = extra.offCY; lda = extra.strideA.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = base->K(); } if (transA == clblasNoTrans) { rowsA = N; columnsA = K; } else { rowsA = K; columnsA = N; } rowsC = N; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } printTestParams(order, uplo, transA, N, K, true, paramAlpha, offA, lda, true, paramBeta, offC, ldc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; size_t N, K; size_t offA, offC; size_t lda, ldc; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsC, columnsC; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // HERK_H_ clblas-2.10/src/tests/include/hpmv.h000066400000000000000000000015431264277366700174220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#ifndef HPMV_H_
#define HPMV_H_

#define HEMV_PACKED
#include
#undef HEMV_PACKED

#endif // HPMV_H_
clblas-2.10/src/tests/include/hpr.h000066400000000000000000000015241264277366700172400ustar00rootroot00000000000000/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#ifndef HPR_H_
#define HPR_H_

#define HER_PACKED
#include "her.h"
#undef HER_PACKED

#endif
clblas-2.10/src/tests/include/hpr2.h000066400000000000000000000015261264277366700173240ustar00rootroot00000000000000/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#ifndef HPR2_H_
#define HPR2_H_

#define HER2_PACKED
#include "her2.h"
#undef HER2_PACKED

#endif
clblas-2.10/src/tests/include/iamax.h000066400000000000000000000042441264277366700175500ustar00rootroot00000000000000/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
* ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class iAMAX : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx, should be greater than 0 int, //offx int, //offa -- for offiAmax int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->offBX = offx; params->offa = offiAmax; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); offx = ::std::tr1::get<2>(GetParam()); offiAmax = ::std::tr1::get<3>(GetParam()); numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx); ::std::cerr << "offiAmax = " << offiAmax << ::std::endl; } size_t N; int incx; size_t offx, offiAmax; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/matrix.h000066400000000000000000000436321264277366700177610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef MATRIX_H_ #define MATRIX_H_ #include #include #include #include // Data Generation #include template static T getElement( clblasOrder order, clblasTranspose trans, size_t row, size_t column, const T *A, size_t lda) { if ( lda > 0) // General case { switch (order) { case clblasRowMajor: if (trans == clblasNoTrans) { A += lda * row; return A[column]; } else { A += lda * column; return A[row]; } break; case clblasColumnMajor: if (trans == clblasNoTrans) { A += lda * column; return A[row]; } else { A += lda * row; return A[column]; } break; } /* Unreachable point */ return FNAN(); } else { // Needed for Macro : testDG.h int vectorLength = 1; const T* data = A; if ( order == clblasRowMajor) { return *RMLPacked(row, column); } else { // return CMLPacked(row, column); return FNAN(); } } } template static void setElement( clblasOrder order, clblasTranspose trans, size_t row, size_t column, T *A, size_t lda, T value) { switch (order) { case clblasRowMajor: if (trans == clblasNoTrans) { A += lda * row; A[column] = value; } else { A += lda * column; A[row] = value; } break; case clblasColumnMajor: if (trans == clblasNoTrans) { A += lda * column; A[row] = value; } else { A += lda * row; A[column] = value; } break; } } template static void setElementPacked( clblasOrder order, clblasTranspose trans, clblasUplo uplo, size_t row, size_t column, T *A, size_t rows, T value) { // Needed for Macro : testDG.h int vectorLength = 1; const T* data = A; clblasUplo fUplo = (trans == clblasNoTrans) ? 
uplo : ((uplo == clblasUpper) ? clblasLower : clblasUpper); if(fUplo == clblasLower) //Should not access elements out of bounds. { if (column > row) return; } else { if (column < row) return; } switch (order) { case clblasRowMajor: if (fUplo == clblasLower) { *RMLPacked(row, column) = value; } else { *RMUPacked(row, column) = value; } break; case clblasColumnMajor: if (fUplo == clblasLower) { *CMLPacked(row, column) = value; } else { *CMUPacked(row, column) = value; } break; } } template static T getElementPacked( clblasOrder order, clblasTranspose trans, clblasUplo uplo, size_t row, size_t column, T *A, size_t rows) { // Needed for Macro : testDG.h int vectorLength = 1; const T* data = A; clblasUplo fUplo = (trans == clblasNoTrans) ? uplo : ((uplo == clblasUpper) ? clblasLower : clblasUpper); if(fUplo == clblasLower) //Should not access elements out of bounds. { if (column > row) return FNAN(); } else { if (column < row) return FNAN(); } switch (order) { case clblasRowMajor: if (fUplo == clblasLower) { return *RMLPacked(row, column); } else { return *RMUPacked(row, column); } break; case clblasColumnMajor: if (fUplo == clblasLower) { return *CMLPacked(row, column); } else { return *CMUPacked(row, column); } break; default: return FNAN(); } } template static void printElement(T a) { std::cout << a << "\t"; } template<> __template_static void printElement(FloatComplex a) { std::cout << "(" << a.s[0] << ", " << a.s[1] << ")\t"; } template<> __template_static void printElement(DoubleComplex a) { std::cout << "(" << a.s[0] << ", " << a.s[1] << ")\t"; } template static void printMatrixBlock( clblasOrder order, size_t startRow, size_t startCol, size_t nrRows, size_t nrCols, size_t lda, T *A) { // FIXME : Packed Matrix size_t i, j; T a; for (i = 0; i < nrRows; i++) { for (j = 0; j < nrCols; j++) { a = getElement(order, clblasNoTrans, startRow + i, startCol + j, A, lda); printElement(a); } std::cout << std::endl; } std::cout << std::endl << std::endl; } template static void reorderMatrix( clblasOrder order, size_t rowsA, size_t columnsA, const T *A, T *B) { size_t lda = 0, ldb = 0; size_t x, y; clblasOrder orderB = clblasRowMajor; switch (order) { case clblasColumnMajor: orderB = clblasRowMajor; lda = rowsA; ldb = columnsA; break; case clblasRowMajor: orderB = clblasColumnMajor; lda = columnsA; ldb = rowsA; break; } for (y = 0; y < rowsA; y++) { for (x = 0; x < columnsA; x++) { setElement(orderB, clblasNoTrans, y, x, B, ldb, getElement(order, clblasNoTrans, y, x, A, lda)); } } } template static void compareMatrices( clblasOrder order, size_t M, size_t N, const T *A, const T *B, size_t lda, const cl_double *absDelta = NULL) { size_t m = 0, n = 0; T a, b; cl_double delta; if( lda > 0 ) // General case { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(a, b, delta); } } } else // Packed case { if ( order == clblasColumnMajor) { for ( n = 0; n < N; n++) { for( m=n; m < M; m++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(a, b, delta); } } } else { for ( m = 0; m < M; m++) { for( n = 0; n <= m; n++) { a = getElement(order, clblasNoTrans, m, 
n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( module(a-b) > delta ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(a, b, delta); } } } } } template<> __template_static void compareMatrices( clblasOrder order, size_t M, size_t N, const FloatComplex *A, const FloatComplex *B, size_t lda, const cl_double *absDelta) { size_t m = 0, n = 0; FloatComplex a, b; cl_double delta; if ( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } else // Packed case { if ( order == clblasColumnMajor) { for ( n = 0; n < N; n++) { for( m=n; m < M; m++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } else { for ( m = 0; m < M; m++) { for( n = 0; n <= m; n++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } } } template<> __template_static void compareMatrices( clblasOrder order, size_t M, size_t N, const DoubleComplex *A, const DoubleComplex *B, size_t lda, const cl_double *absDelta) { size_t m = 0, n = 0; DoubleComplex a, b; cl_double delta; if( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } else // Packed case { if ( order == clblasColumnMajor) { for ( n = 0; n < N; n++) { for( m=n; m < M; m++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } else { for ( m = 0; m < M; m++) { for( n = 0; n <= m; n++) { a = getElement(order, clblasNoTrans, m, n, A, lda); b = getElement(order, clblasNoTrans, m, n, B, lda); delta = 0.0; if (absDelta != NULL) { //delta = absDelta[m * N + n]; } if( (module(CREAL(a) - CREAL(b)) > delta) || (module(CIMAG(a) - CIMAG(b)) > delta) ) printf("m : %d\t n: %d\n", (int)m, (int)n); ASSERT_NEAR(CREAL(a), CREAL(b), delta); ASSERT_NEAR(CIMAG(a), CIMAG(b), delta); } } } } } template static void setNans( 
size_t len, T *buf) { size_t i; for (i = 0; i < len; i++) { buf[i] = FNAN(); } } // set to NAN elements of upper or lower triangle of square matrix template static void setTriangleNans( clblasOrder order, clblasUplo uplo, size_t N, T *A, size_t lda) { size_t i, j; // For matrix A for (i = 0; i < N; i++) { switch (uplo) { case clblasUpper: for (j = 0; j < i; j++) { setElement(order, clblasNoTrans, i, j, A, lda, FNAN()); } break; case clblasLower: for (j = i + 1; j < N; j++) { setElement(order, clblasNoTrans, i, j, A, lda, FNAN()); } break; } } } template static void setVectorNans( size_t offset, size_t dx, T *B, size_t N, size_t memLen) { size_t i; for (i = 0; i < offset; i++) { B[i] = FNAN(); } for (i = offset; i <= offset + dx * (N - 1); i++) { if (((i - offset) % dx) != 0) { B[i] = FNAN(); } } for (; i < memLen; i++) { B[i] = FNAN(); } } template static void compareVectors( size_t offset, size_t N, size_t dy, size_t memLen, T *blasC, T *clblasC) { size_t tailBegin, tailEnd; // check the beginning containing NANs ASSERT_FALSE(memcmp(blasC, clblasC, offset * sizeof(blasC[0]))); // check vector values compareMatrices(clblasRowMajor, N, 1, blasC + offset, clblasC + offset, dy); // check NANs between vector values if (dy != 1) { size_t i; size_t start, end; start = offset + 1; end = start + dy - 1; for (i = 0; i < N - 1; i++) { ASSERT_FALSE(memcmp(blasC + start, clblasC + start, (end - start) * sizeof(blasC[0]))); } } // check tail containing NANs tailBegin = offset; if (dy == 1) { tailBegin += N; } else { tailBegin += N + (N - 1) * (dy - 1); } tailEnd = memLen; ASSERT_FALSE(memcmp(blasC + tailBegin, clblasC + tailBegin, (tailEnd - tailBegin) * sizeof(blasC[0]))); } // Works only for NxN matrix template static T getElementBanded( clblasOrder order, clblasUplo uplo, size_t row, size_t column, size_t K, const T *A, size_t lda) { switch (order) { case clblasRowMajor: A += lda * row; return (uplo == clblasLower)? A[ K - (row-column) ]: A[ column-row ]; break; case clblasColumnMajor: A += lda * column; return (uplo == clblasLower)? 
A[ row-column ]: A[ K - (column-row) ]; break; } /* Unreachable point */ return FNAN(); } template static void setElementBanded( clblasOrder order, clblasUplo uplo, size_t row, size_t column, size_t K, T *A, size_t lda, T value) { switch (order) { case clblasRowMajor: A += lda * row; if (uplo == clblasLower) { A[ K - (row-column) ] = value; } else { A[ column-row ] = value; } break; case clblasColumnMajor: A += lda * column; if (uplo == clblasLower) { A[ row-column ] = value; } else { A[ K - (column-row) ] = value; } break; } } //conjugate function to handle rowmajor as columnmajor // for float and double do nothing template static void doConjugate( T *A, size_t M, size_t N, size_t lda) { if( M || N || lda|| A){} // Dummy to avoid warnings return; } template<> __template_static void doConjugate( FloatComplex *A, size_t M, size_t N, size_t lda) { size_t m, n; FloatComplex b; if ( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { b = getElement(clblasRowMajor, clblasNoTrans, m, n, A, lda); CIMAG(b) *= (-1); setElement(clblasRowMajor, clblasNoTrans, m, n, A, lda, b); } } } } template<> __template_static void doConjugate( DoubleComplex *A, size_t M, size_t N, size_t lda) { size_t m, n; DoubleComplex b; if ( lda > 0 ) { for (m = 0; m < M; m++) { for (n = 0; n < N; n++) { b = getElement(clblasRowMajor, clblasNoTrans, m, n, A, lda); CIMAG(b) *= (-1); setElement(clblasRowMajor, clblasNoTrans, m, n, A, lda, b); } } } } template static void compareValues( const T *A, const T *B, const cl_double absDelta=0.0 ) { T a, b; a = *A; b = *B; ASSERT_NEAR(a, b, absDelta); } template<> __template_static void compareValues ( const FloatComplex *A, const FloatComplex *B, const cl_double absDelta ) { FloatComplex a, b; a = *A; b = *B; ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); } template<> __template_static void compareValues ( const DoubleComplex *A, const DoubleComplex *B, const cl_double absDelta ) { DoubleComplex a, b; a = *A; b = *B; ASSERT_NEAR(CREAL(a), CREAL(b), absDelta); ASSERT_NEAR(CIMAG(a), CIMAG(b), absDelta); } #endif // MATRIX_H_ clblas-2.10/src/tests/include/nrm2.h000066400000000000000000000043361264277366700173310ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
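// --- Editor's note (added) -------------------------------------------------
// A minimal, self-contained sketch -- not part of clBLAS -- of the offset
// arithmetic used by getElementBanded()/setElementBanded() above. For an
// N x N banded matrix with K sub/super-diagonals, only elements with
// |row - col| <= K are stored; each row (row-major) or column (column-major)
// occupies lda consecutive slots, and the main diagonal lands at offset K for
// row-major/lower and column-major/upper storage, at offset 0 otherwise.
// Callers are assumed to pass only elements inside the stored band; the
// helper name bandedOffset is hypothetical.
#include <cstddef>

static size_t bandedOffset(bool rowMajor, bool lower,
                           size_t row, size_t col, size_t K, size_t lda)
{
    if (rowMajor) {
        // one slice per row; lower storage keeps the diagonal at index K
        return lda * row + (lower ? K - (row - col) : col - row);
    }
    // column-major: one slice per column; lower storage keeps the diagonal at 0
    return lda * col + (lower ? row - col : K - (col - row));
}

// Example: row-major, lower, K = 2, lda = 3 -> element (4,3) sits one slot
// left of the diagonal of row 4, i.e. at offset 4*3 + (2 - 1) = 13.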
* ************************************************************************/ #include #include #include #include #include using ::testing::TestWithParam; class NRM2 : public TestWithParam< ::std::tr1::tuple< int, // N int, // incx int, // offx int, // offa -- for offNRM2 int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->incx = incx; params->offBX = offx; params->offa = offNRM2; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { //size_t lenX; N = ::std::tr1::get<0>(GetParam()); incx = ::std::tr1::get<1>(GetParam()); offx = ::std::tr1::get<2>(GetParam()); offNRM2 = ::std::tr1::get<3>(GetParam()); numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offx, incx); ::std::cerr << "offNRM2 = " << offNRM2 << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; int incx; size_t offx, offNRM2; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; clblas-2.10/src/tests/include/rot.h000066400000000000000000000047401264277366700172560ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef ROT_H_ #define ROT_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class ROT : public TestWithParam< ::std::tr1::tuple< int, // N int, // offx int, // incx int, // offy int, // incy ComplexLong, // C ComplexLong, // S int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->offa= offa; //offx params->offb = offb; // offy params->incx = incx; params->incy = incy; params->alpha = alpha; // C params->beta = beta; //S params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); offa = ::std::tr1::get<1>(GetParam()); incx = ::std::tr1::get<2>(GetParam()); offb = ::std::tr1::get<3>(GetParam()); incy = ::std::tr1::get<4>(GetParam()); alpha = ::std::tr1::get<5>(GetParam()); beta = ::std::tr1::get<6>(GetParam()); numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } printTestParams(N, offa, incx, offb, incy, alpha, beta ); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N, offa, offb; int incx, incy; ComplexLong alpha; ComplexLong beta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/rotg.h000066400000000000000000000041421264277366700174210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef ROTG_H_ #define ROTG_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class ROTG : public TestWithParam< ::std::tr1::tuple< int, //offsa int, //offsb int, //offc int, //offs int //numCommandQueues > > { public: void getParams(TestParams *params) { params->offa = offC; params->offb = offS; params->offBX = offSA; params->offCY = offSB; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { offSA = ::std::tr1::get<0>(GetParam()); offSB = ::std::tr1::get<1>(GetParam()); offC = ::std::tr1::get<2>(GetParam()); offS = ::std::tr1::get<3>(GetParam()); numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } printTestParams(offSA, offSB, offC, offS); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t offSA, offSB, offC, offS; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/rotm.h000066400000000000000000000050671264277366700174360ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef ROTM_H_ #define ROTM_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class ROTM : public TestWithParam< ::std::tr1::tuple< int, // N int, // offx int, // incx int, // offy int, // incy int, // offParam ComplexLong, // SFLAG Param int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->offa= offa; // corresponds to offx params->offb = offb; // corresponds to offy params->offc = offc; // corresponds to offParam params->incx = incx; params->incy = incy; params->alpha = alpha; // corresponds to sflagparam params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); offa = ::std::tr1::get<1>(GetParam()); incx = ::std::tr1::get<2>(GetParam()); offb = ::std::tr1::get<3>(GetParam()); incy = ::std::tr1::get<4>(GetParam()); offc = ::std::tr1::get<5>(GetParam()); alpha = ::std::tr1::get<6>(GetParam()); numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } printTestParams(N, offa, incx, offb, incy, offc, alpha); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N, offa, offb, offc; int incx, incy; ComplexLong alpha; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/rotmg.h000066400000000000000000000061221264277366700175760ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
* ************************************************************************/ #ifndef ROTMG_H_ #define ROTMG_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class ROTMG : public TestWithParam< ::std::tr1::tuple< int, // offD1 int, // offD2 int, // offBX int, // offCY int, // offParam ComplexLong, // SFLAG Param int // numCommandQueues > > { public: void getParams(TestParams *params) { params->offBX= offBX; // corresponds to offx params->offCY = offCY; // corresponds to offy params->offa = offa; // corresponds to offD1 params->offb = offb; // corresponds to offD2 params->offc = offc; // corresponds to offParam params->alpha = alpha; // corresponds to sflagparam params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { offBX = ::std::tr1::get<0>(GetParam()); offCY = ::std::tr1::get<1>(GetParam()); offa = ::std::tr1::get<2>(GetParam()); offb = ::std::tr1::get<3>(GetParam()); offc = ::std::tr1::get<4>(GetParam()); alpha = ::std::tr1::get<5>(GetParam()); numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } printTestParams(offBX, offCY, offa, offb, offc, alpha); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } int offa, offb, offc, offBX, offCY; ComplexLong alpha; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; template static void randomRotmg( T *D1, T *D2, T *X, T *Y, T *PARAM ) { // Since rotmg involves up to 3 multiplications on an element, taking cube-root cl_double bound = pow(UPPER_BOUND(), (1.0/3)) / 2.0; *D1 = random(bound); *D2 = random(bound); *X = random(bound); *Y = random(bound); // Populate PARAM. Flag in PARAM[0] is expected to be set outside this function call for(int i=1; i<=4; i++) { PARAM[i] = random(bound); } } #endif clblas-2.10/src/tests/include/sbmv.h000066400000000000000000000113471264277366700174220ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License.
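// --- Editor's note (added) -------------------------------------------------
// Why randomRotmg() above draws every value from a cube-root-sized range:
// rotmg may multiply up to three of the generated quantities together, so
// with every input bounded by b = cbrt(UPPER_BOUND) / 2 the worst-case
// product magnitude is b^3 = UPPER_BOUND / 8, comfortably inside the exactly
// representable range. A standalone illustration for the float case
// (UPPER_BOUND == 2^23, defined in test-limits.h later in this directory);
// the function name is hypothetical.
#include <cmath>
#include <cstdio>

static void rotmgBoundSketch(void)
{
    const double upper = 8388608.0;       // 2^23, FLOAT_UPPER_BOUND
    const double b = cbrt(upper) / 2.0;   // roughly 101.6
    printf("bound = %.3f, worst product = %.1f (< %.1f)\n",
           b, b * b * b, upper);
}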
* ************************************************************************/ #ifndef SBMV_H_ #define SBMV_H_ #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SBMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N int, // K ExtraTestSizes, ComplexLong, // Alpha ComplexLong, // Beta int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->K = KLU; params->lda = lda; params->incx = incx; params->incy = incy; params->offA = offA; params->offa = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); KLU = ::std::tr1::get<3>(GetParam()); extra = ::std::tr1::get<4>(GetParam()); offA = extra.offA; offx = extra.offBX; offy = extra.offCY; lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; paramAlpha = ::std::tr1::get<5>(GetParam()); paramBeta = ::std::tr1::get<6>(GetParam()); numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); printTestParams(order, uplo, N, KLU, paramAlpha, offA, lda, offx, incx, paramBeta, offy, incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N, KLU; size_t lda; int incx, incy; size_t offA, offx, offy; unsigned int seed; ComplexLong paramAlpha, paramBeta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; /*template static void randomGbmvMatrices( clblasOrder order, clblasTranspose trans, size_t M, size_t N, T *alpha, T *beta, T *A, size_t lda, T *X, int incx, T *Y, int incy ) { size_t i; size_t lenX, lenY, lenA; cl_double bound, maxAB, maxMN; // bound is calculated by solving the equation (alpha*x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); if(module(maxVal(*alpha)) > (sqrt(bound) / (2.0))) *alpha = random((sqrt(bound) / (2.0))); if(module(maxVal(*beta)) > (sqrt(bound) / (2.0))) *beta = random((sqrt(bound) / (2.0))); maxAB = module( ::std::max(maxVal(*alpha), maxVal(*beta)) ); // Take max of alpha & beta maxMN = ::std::max( M, N ); bound = sqrt( bound / (maxAB*maxMN) ); // (maxAB * N * bound^2 + maxAB * bound - UPPER_BOUND) < 0 lenA = ((order == clblasRowMajor)? M: N) * lda; for (i = 0; i < lenA; i++) { A[i] = random(bound); } if( trans == clblasNoTrans ) { lenX = 1 + ((N - 1) * abs(incx)); lenY = 1 + ((M - 1) * abs(incy)); } else { lenX = 1 + ((M - 1) * abs(incx)); lenY = 1 + ((N - 1) * abs(incy)); } if (X != NULL) { for (i = 0; i < lenX; i++) { X[i] = random(bound); } } if (Y != NULL) { for (i = 0; i < lenY; i++) { Y[i] = random(bound); } } }*/ #endif // GBMV_H_ clblas-2.10/src/tests/include/scal.h000066400000000000000000000045411264277366700173730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
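// --- Editor's note (added) -------------------------------------------------
// In the banded SetUp() above (SBMV; TBMV and TBSV later in this directory
// follow the same pattern), the requested diagonal count is folded into range
// with K = K % N and lda is then clamped to at least K + 1, because banded
// storage needs one slot per stored diagonal -- K off-diagonals plus the main
// diagonal -- in every row or column slice. Hypothetical standalone helper
// restating that clamp:
#include <algorithm>
#include <cstddef>

static size_t clampBandedLda(size_t lda, size_t k, size_t n)
{
    k %= n;                           // keep the band inside the N x N matrix
    return std::max(lda, k + 1);      // room for the diagonal plus K bands
}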
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SCAL_H_ #define SCAL_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SCAL : public TestWithParam< ::std::tr1::tuple< int, // N ComplexLong, // alpha int, // offx int, // incx int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->alpha = paramAlpha; params->offBX = offx; params->incx = incx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); paramAlpha = ::std::tr1::get<1>(GetParam()); offx = ::std::tr1::get<2>(GetParam()); incx = ::std::tr1::get<3>(GetParam()); numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } printTestParams(N, paramAlpha, offx, incx); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; unsigned int seed; size_t offx; int incx; bool useAlpha; ComplexLong paramAlpha; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SCAL_H_ clblas-2.10/src/tests/include/spmv.h000066400000000000000000000124331264277366700174350ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SPMV_H_ #define SPMV_H_ #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SPMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N ComplexLong, // Alpha ComplexLong, // Beta size_t, // offA size_t, // offx size_t, // offy ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->lda = lda; params->incx = incx; params->incy = incy; params->offA = offA; params->offBX = offx; params->offCY = offy; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); paramAlpha = ::std::tr1::get<3>(GetParam()); paramBeta = ::std::tr1::get<4>(GetParam()); offA = ::std::tr1::get<5>(GetParam()); offx = ::std::tr1::get<6>(GetParam()); offy = ::std::tr1::get<7>(GetParam()); extra = ::std::tr1::get<8>(GetParam()); lda = 0; incx = extra.strideBX.inc; incy = extra.strideCY.inc; numCommandQueues = ::std::tr1::get<9>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useIncX()) { incx = base->incX(); } if (base->useIncY()) { incy = base->incY(); } printTestParams(order, uplo, N, paramAlpha, offA, 0, offx, incx, paramBeta, offy, incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx, incy; size_t offA, offx, offy; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; ::clMath::BlasBase *base; cl_ulong imageA, imageX, imageY; bool useNumCommandQueues; cl_uint numCommandQueues; }; template static void randomSpmvMatrices( clblasOrder order, clblasUplo uplo, size_t N, bool useAlpha, T *alpha, T *A, T *X, int incx, bool useBeta, T *beta, T *Y, int incy ) { size_t i, j; size_t lengthX; size_t lengthY; cl_double bound; cl_double fAlpha, fBeta; if (!useAlpha) { *alpha = random(100); if (module(*alpha) == 0.0) { *alpha = 1.0; } } if (!useBeta) { *beta = random(100); if (module(*beta) == 0.0) { *beta = 1.0; } } bound = UPPER_BOUND(); if(module(*alpha) > bound) *alpha = random((sqrt(bound) / ((2.0) * N))); if (module(*alpha) == 0.0) { *alpha = 1.0; } if(module(*beta) > bound) *beta = random((sqrt(bound))); if (module(*beta) == 0.0) { *beta = 1.0; } fAlpha = module(*alpha); fBeta = module(*beta); bound = bound / (fAlpha * N); bound = sqrt( ((((((fBeta * fBeta)) / fAlpha) / (4.0)) / fAlpha) / (N * N)) + bound) - ((fBeta) / ((2.0) * (fAlpha) * N)); for (i = 0; i < N; i++) { for (j = 0; j < N; j++) { setElementPacked(order, clblasNoTrans, uplo, i, j, A, N, random(bound)); } } lengthX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lengthX; i++) { X[i] = random(bound); } } lengthY = 1 + (N - 1) * abs(incy); if (Y != NULL) { for (i = 0; i < lengthY; i++) { Y[i] = random(bound); } } } #endif // 
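// --- Editor's note (added) -------------------------------------------------
// The final bound in randomSpmvMatrices() above is the positive root of
//     |alpha| * N * x^2 + |beta| * x - UPPER_BOUND = 0,
// i.e. the largest magnitude x for which y = alpha*A*x + beta*y (at most N
// products of two values bounded by x, plus one beta*y term) cannot exceed
// the exactly representable range. A standalone restatement of that root;
// the function and parameter names are hypothetical.
#include <cmath>

static double spmvRandomBound(double upperBound, double alpha, double beta,
                              double n)
{
    const double a = fabs(alpha);
    const double b = fabs(beta);
    return sqrt(b * b / (4.0 * a * a * n * n) + upperBound / (a * n))
           - b / (2.0 * a * n);
}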
SPMV_H_ clblas-2.10/src/tests/include/spr.h000066400000000000000000000015201264277366700172470ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SPR_H_ #define SPR_H_ #define SYR_PACKED #include "syr.h" #undef SYR_PACKED #endif clblas-2.10/src/tests/include/spr2.h000066400000000000000000000015271264277366700173400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SPR2_H_ #define SPR2_H_ #define SYR2_PACKED #include "syr2.h" #undef SYR2_PACKED #endifclblas-2.10/src/tests/include/swap.h000066400000000000000000000046201264277366700174210ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SWAP__H_ #define SWAP__H_ #include #include #include #include using namespace clMath; using ::testing::TestWithParam; // Name SWAP creates problem in gTest class SWAPXY : public TestWithParam< ::std::tr1::tuple< int, // N int, // offBX int, // incx, should not be 0 int, //offCY int, //incy, should not be 0 int // numCommandQueues > > { public: void getParams(TestParams *params) { params->N = N; params->offBX = offBX; params->incx = incx; params->offCY = offCY; params->incy = incy; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { N = ::std::tr1::get<0>(GetParam()); offBX = ::std::tr1::get<1>(GetParam()); incx = ::std::tr1::get<2>(GetParam()); offCY = ::std::tr1::get<3>(GetParam()); incy = ::std::tr1::get<4>(GetParam()); numCommandQueues = ::std::tr1::get<5>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(N, offBX, incx, offCY, incy); ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } size_t N; size_t offBX; int incx; size_t offCY; int incy; unsigned int seed; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/symm.h000066400000000000000000000074511264277366700174410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SYMM_H_ #define SYMM_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SYMM : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasSide, // side clblasUplo, // uplo int, // M int, // N cl_float2, //alpha cl_float2, //beta ExtraTestSizes, // to get more than ten parameters in gtest. 
int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->seed = seed; params->side = side; params->uplo = uplo; params->M = M; params->N = N; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->offa = offa; params->offb = offb; params->offc = offc; params->alpha.re = (long)CREAL(alpha); params->alpha.imag = (long)CIMAG(alpha); params->beta.re = (long)CREAL(beta); params->beta.imag = (long)CIMAG(beta); params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); side = ::std::tr1::get<1>(GetParam()); uplo = ::std::tr1::get<2>(GetParam()); M = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); alpha = ::std::tr1::get<5>(GetParam()); beta = ::std::tr1::get<6>(GetParam()); extra = ::std::tr1::get<7>(GetParam()); offa = extra.offA; offb = extra.offBX; offc = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } if( side == clblasLeft ) { lda = ::std::max(lda, M); } else { lda = ::std::max(lda, N); } switch (order) { case clblasRowMajor: ldb = ::std::max(ldb, N); ldc = ::std::max(ldc, N); break; case clblasColumnMajor: ldb = ::std::max(ldb, M); ldc = ::std::max(ldc, M); break; } printTestParams(order, side, uplo, M, N, 1, alpha, 1, beta, lda, ldb, ldc, offa, offb, offc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasSide side; clblasUplo uplo; size_t M, N; size_t lda, ldb, ldc; size_t offa, offb, offc; unsigned int seed; cl_float2 alpha, beta; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYMM_H_ clblas-2.10/src/tests/include/symv.h000066400000000000000000000123461264277366700174510ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SYMV_H_ #define SYMV_H_ #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SYMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->rowsB = rowsB; params->rowsC = rowsC; params->columnsA = columnsA; params->columnsB = columnsB; params->columnsC = columnsC; params->incx = incx; params->incy = incy; params->offA = offsetA; params->offBX = offsetx; params->offCY = offsety; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); extra = ::std::tr1::get<3>(GetParam()); offsetA = extra.offA; lda = extra.strideA.ld; incx = extra.strideBX.inc; incy = extra.strideCY.inc; numCommandQueues = ::std::tr1::get<4>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useIncX()) { incx = base->incX(); } if (base->useIncY()) { incy = base->incY(); } lda = ::std::max(lda, N); if (incx == 1 || incx == -1) { /* X is row vector for row major matrix B * or column vector for column major matrix B */ ldb = lda; offsetx = (N / 2) * ldb; } else { /* X is column vector for row major matrix B * or row vector for column major matrix B */ ldb = ::std::max(N, (size_t)module(incx)); offsetx = N / 2; incx = incx > 0 ? (int)ldb : (int)(0-ldb); } if (incy == 1 || incy == -1) { /* Y is row vector in row major matrix C * or column vector in column major matrix C */ ldc = lda; offsety = (N / 2) * ldc; } else { /* Y is column vector in matrix C * or row vector in column major matrix C */ ldc = ::std::max(N, (size_t)module(incy)); offsety = N / 2; incy = incy > 0 ? 
(int)ldc : (int)(0-ldc); } switch (order) { case clblasRowMajor: columnsA = lda; columnsB = ldb; columnsC = ldc; rowsA = N; rowsB = N; rowsC = N; break; case clblasColumnMajor: rowsA = lda; rowsB = ldb; rowsC = ldc; columnsA = N; columnsB = N; columnsC = N; break; } printTestParams(order, uplo, N, useAlpha, base->alpha(), offsetA, lda, incx, useBeta, base->beta(), incy); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda, ldb, ldc; size_t offsetA, offsetx, offsety; int incx, incy; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA, rowsB, columnsB, rowsC, columnsC; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYMV_H_ clblas-2.10/src/tests/include/syr.h000066400000000000000000000065521264277366700172720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #if !defined(SYR_PACKED) #ifndef SYR_H #define SYR_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include using ::testing::TestWithParam; #ifndef SYR_PACKED class SYR : public TestWithParam< #else class SPR : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N double, // alpha int, // offx int, // incx, should be greater than 0 int, // offa int, // lda, 0 - undefined int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->alpha.re = (long)alpha; // This will cast alpha to long. 
So the real value that is // passed is not the same as what is set in the test case params->offBX = offx; params->incx = incx; params->offa = offa; params->lda = lda; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); alpha = ::std::tr1::get<3>(GetParam()); offx = ::std::tr1::get<4>(GetParam()); incx = ::std::tr1::get<5>(GetParam()); offa = ::std::tr1::get<6>(GetParam()); lda = ::std::tr1::get<7>(GetParam()); numCommandQueues = ::std::tr1::get<8>(GetParam()); #ifndef SYR_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); //base->setAlpha(50); //alpha = 50.0; useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } // if (base->useAlpha()) { // alpha = base->Alpha(); // } printTestParams(order, uplo, N, alpha, offx, incx, offa, lda); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx; size_t offx, offa; unsigned int seed; double alpha; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYR_H_ clblas-2.10/src/tests/include/syr2.h000066400000000000000000000072451264277366700173540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #if !defined(SYR2_PACKED) #ifndef SYR2_H #define SYR2_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include using ::testing::TestWithParam; #ifndef SYR2_PACKED class SYR2 : public TestWithParam< #else class SPR2 : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo int, // N double, //alpha int, // offx int, // incx, should be greater than 0 int, // offy, //int, // incy, should be greater than 0. // Since tuple doesnot allow more than 10 arguments we assume incy = incx; int, // offa int, // lda, 0 - undefined int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->seed = seed; params->N = N; params->alpha.re = (long)alpha; // This will cast alpha to long. 
So the real value that is // passed is not the same as what is set in the test case params->offBX = offx; params->incx = incx; params->offCY = offy; params->incy = incy; params->offa = offa; params->lda = lda; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); N = ::std::tr1::get<2>(GetParam()); alpha = ::std::tr1::get<3>(GetParam()); offx = ::std::tr1::get<4>(GetParam()); incx = ::std::tr1::get<5>(GetParam()); offy = ::std::tr1::get<6>(GetParam()); offa = ::std::tr1::get<7>(GetParam()); lda = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); incy = incx; //GTest allows only 10 arguments to be passed and //hence we define incy to be equivalent to incx. #ifndef SYR2_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(order, uplo, N, alpha, offx, incx, offy, incy, offa, lda); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; size_t N; size_t lda; int incx, incy; size_t offx, offy, offa; unsigned int seed; double alpha; ::clMath::BlasBase *base; cl_ulong imageA, imageX, imageY; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYR2_H_ clblas-2.10/src/tests/include/syr2k.h000066400000000000000000000114301264277366700175160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef SYR2K_H_ #define SYR2K_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SYR2K : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA int, // N int, // K ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->transA = transA; params->seed = seed; params->N = N; params->K = K; params->offA = offA; params->offBX = offB; params->offCY = offC; params->lda = lda; params->ldb = ldb; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->rowsC = rowsC; params->columnsC = columnsC; params->alpha = paramAlpha; params->beta = paramBeta; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); K = ::std::tr1::get<4>(GetParam()); extra = ::std::tr1::get<5>(GetParam()); offA = extra.offA; offB = extra.offBX; offC = extra.offCY; lda = extra.strideA.ld; ldb = extra.strideBX.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = base->K(); } if (transA == clblasNoTrans) { rowsA = N; columnsA = K; rowsB = N; columnsB = K; } else { rowsA = K; columnsA = N; rowsB = K; columnsB = N; } rowsC = N; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(), offA, lda, offB, ldb, useBeta, base->beta(), offC, ldc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; size_t N, K; size_t offA, offB, offC; size_t lda, ldb, ldc; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsB, columnsB; size_t rowsC, columnsC; ::clMath::BlasBase *base; cl_ulong imageA, imageB; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYR2K_H_ clblas-2.10/src/tests/include/syrk.h000066400000000000000000000103271264277366700174400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef SYRK_H_ #define SYRK_H_ #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class SYRK : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA int, // N int, // K ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->transA = transA; params->seed = seed; params->N = N; params->K = K; params->offA = offA; params->offCY = offC; params->lda = lda; params->ldc = ldc; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsC = rowsC; params->columnsC = columnsC; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); N = ::std::tr1::get<3>(GetParam()); K = ::std::tr1::get<4>(GetParam()); extra = ::std::tr1::get<5>(GetParam()); offA = extra.offA; offC = extra.offCY; lda = extra.strideA.ld; ldc = extra.strideCY.ld; numCommandQueues = ::std::tr1::get<6>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } useBeta = base->useBeta(); if (useBeta != 0) { paramBeta = base->beta(); } if (base->useN()) { N = base->N(); } if (base->useK()) { K = base->K(); } if (transA == clblasNoTrans) { rowsA = N; columnsA = K; } else { rowsA = K; columnsA = N; } rowsC = N; columnsC = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldc = ::std::max(ldc, columnsC); columnsC = ldc; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldc = ::std::max(ldc, rowsC); rowsC = ldc; break; } printTestParams(order, uplo, transA, N, K, useAlpha, base->alpha(), offA, lda, useBeta, base->beta(), offC, ldc); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; size_t N, K; size_t offA, offC; size_t lda, ldc; unsigned int seed; bool useAlpha, useBeta; ComplexLong paramAlpha, paramBeta; size_t rowsA, columnsA; size_t rowsC, columnsC; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // SYRK_H_ clblas-2.10/src/tests/include/tbmv.h000066400000000000000000000075001264277366700174170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TBMV_H_ #define TBMV_H_ #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class TBMV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // N int, // KL or KU ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->N = N; params->K = KLU; params->lda = lda; params->incx = incx; params->offA = offA; params->offa = offA; params->offBX = offx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); diag = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); KLU = ::std::tr1::get<5>(GetParam()); extra = ::std::tr1::get<6>(GetParam()); offA = extra.offA; offx = extra.offBX; lda = extra.strideA.ld; incx = extra.strideBX.inc; numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); printTestParams(order, uplo, transA, diag, N, KLU, offA, lda, offx, incx, 0, 1); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasTranspose transA; clblasUplo uplo; clblasDiag diag; size_t N, KLU; size_t lda; int incx; size_t offA, offx; unsigned int seed; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; template static void randomTbmvMatrices( size_t N, T *A, size_t lda, T *X, int incx ) { size_t i; size_t lenX, lenA; cl_double bound; // bound is calculated by solving the equation (x^2 + x - UPPER_BOUND) < 0 bound = UPPER_BOUND(); bound = sqrt( bound / N ); // (N * bound^2 - UPPER_BOUND) < 0 lenA = (N) * lda; for (i = 0; i < lenA; i++) { A[i] = random(bound); } lenX = 1 + ((N - 1) * abs(incx)); if (X != NULL) { for (i = 0; i < lenX; i++) { X[i] = random(bound); } } } #endif // TBMV_H_ clblas-2.10/src/tests/include/tbsv.h000066400000000000000000000142041264277366700174240ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
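// --- Editor's note (added) -------------------------------------------------
// randomTbmvMatrices() above uses bound = sqrt(UPPER_BOUND / N): each element
// of the banded matrix-vector product is a sum of at most N products of two
// generated values, so keeping every value below that bound keeps the
// accumulated magnitude under N * bound^2 = UPPER_BOUND and the host/device
// comparison stays within the exactly representable range. Hypothetical
// one-line helper restating the relation:
#include <cmath>

static double tbmvRandomBound(double upperBound, double n)
{
    return sqrt(upperBound / n);   // so that n * bound * bound <= upperBound
}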
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TBSV_H_ #define TBSV_H_ #include #include #include #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class TBSV : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // N int, // KL or KU ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->N = N; params->K = KLU; params->lda = lda; params->incx = incx; params->offA = offA; params->offa = offA; params->offBX = offx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); diag = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); KLU = ::std::tr1::get<5>(GetParam()); extra = ::std::tr1::get<6>(GetParam()); offA = extra.offA; offx = extra.offBX; lda = extra.strideA.ld; incx = extra.strideBX.inc; numCommandQueues = ::std::tr1::get<7>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } KLU = KLU % N; lda = ::std::max(lda, (KLU+1)); printTestParams(order, uplo, transA, diag, N, KLU, offA, lda, offx, incx, 0, 1); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasTranspose transA; clblasUplo uplo; clblasDiag diag; size_t N, KLU; size_t lda; int incx; size_t offA, offx; unsigned int seed; ::clMath::BlasBase *base; bool useNumCommandQueues; cl_uint numCommandQueues; }; template static void randomTbsvMatrices( clblasOrder order, clblasUplo uplo, clblasDiag diag, size_t N, size_t K, T *A, size_t lda, T *X, int incx) { size_t i, j; T min, max, x, y; cl_double modMin, modMax, sum, maxDiag; min = ZERO(); max = ZERO(); incx = abs(incx); maxDiag = 1.0; cl_double bound; bound = (UPPER_BOUND()/(N)); switch (diag) { case clblasUnit: for (i = 0; i < N; i++) { // must not be accessed setElementBanded(order, uplo, i, i, K, A, lda, FNAN()); } break; case clblasNonUnit: /* Do not allow zeros on A's main diagonal and get a big number which is atleast greater than N/4*/ maxDiag = ((N/4) > bound) ? (bound/4) : (N/4); maxDiag = (1 > (maxDiag)) ? 1 : maxDiag; do { max = randomTrsv(bound); } while ((module(max) < (maxDiag))); modMax = module(max); min = max / 100; modMin = module(min); setElementBanded(order, uplo, 0, 0, K, A, lda, max); //printf("Diagonals %d ", max); for (i = 1; i < N; i++) { x = randomTrsv(modMin, modMax); if (module(x) < 1) { x = max; } //printf("%d ", x); /*if(module(x) < 1) { printf("WARNING: Diagonal less than one\n"); }*/ setElementBanded(order, uplo, i, i, K, A, lda, x); } // printf("\n"); break; } /* Generate a_{ij} for all j <> i. 
*/ for (i = 0; i < N; i++) { if (diag == clblasUnit) { sum = module(ONE()); } else { T temp; temp = getElementBanded(order, uplo, i, i, K, A, lda); sum = module(temp); } for (j = 0; j < N; j++) { if ((j == i) || (module((int)(i-j)) > ((int)K)) ) // Diagonal and out-of-band elemnts { continue; } if (((uplo == clblasUpper) && (j > i)) || ((uplo == clblasLower) && (j < i))) { x = randomTrsv(sum/(K + 1)); //Only K + 1 accumulation not N. setElementBanded(order, uplo, i, j, K, A, lda, x); } } } /* Generate matrix X. */ sum = TRSM_LIMIT_B(); for (i = 0; i < N; i++) { if(diag == clblasNonUnit) { sum = module(getElementBanded(order, uplo, i, i, K, A, lda)); } else { sum = module(ONE()); } y = randomTrsv(sum/(K+1)); setElement(clblasColumnMajor, clblasNoTrans, (i * abs(incx)), 0, X, (1 + (N-1)*abs(incx)), y); if (i == 0) { min = y; } else if (module(y) < module(min)) { min = y; } } } #endif // TBSV_H_ clblas-2.10/src/tests/include/test-limits.h000066400000000000000000000045771264277366700207400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TEST_LIMITS_H_ #define TEST_LIMITS_H_ #define FLOAT_UPPER_BOUND pow(2.0, 23) #define DOUBLE_UPPER_BOUND pow(2.0, 52) #define TRSM_FLOAT_LIMIT_A pow(2.0, 7) #define TRSM_DOUBLE_LIMIT_A pow(2.0, 5) #define TRSM_FLOAT_LIMIT_B pow(2.0, 16) #define TRSM_DOUBLE_LIMIT_B pow(2.0, 47) // Type-dependant constants template static cl_double UPPER_BOUND(); template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND;} template<> __template_static cl_double UPPER_BOUND() { return FLOAT_UPPER_BOUND; } template<> __template_static cl_double UPPER_BOUND() { return DOUBLE_UPPER_BOUND; } template static cl_double TRSM_LIMIT_A(); template<> __template_static cl_double TRSM_LIMIT_A() { return TRSM_FLOAT_LIMIT_A; } template<> __template_static cl_double TRSM_LIMIT_A() { return TRSM_DOUBLE_LIMIT_A; } template<> __template_static cl_double TRSM_LIMIT_A() { return TRSM_FLOAT_LIMIT_A; } template<> __template_static cl_double TRSM_LIMIT_A() { return TRSM_DOUBLE_LIMIT_A; } template static cl_double TRSM_LIMIT_B(); template<> __template_static cl_double TRSM_LIMIT_B() { return TRSM_FLOAT_LIMIT_B; } template<> __template_static cl_double TRSM_LIMIT_B() { return TRSM_DOUBLE_LIMIT_B; } template<> __template_static cl_double TRSM_LIMIT_B() { return TRSM_FLOAT_LIMIT_B; } template<> __template_static cl_double TRSM_LIMIT_B() { return TRSM_DOUBLE_LIMIT_B; } #endif /* TEST_LIMITS_H_ */ clblas-2.10/src/tests/include/testDG.h000066400000000000000000000037051264277366700176440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
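 * NOTE (on test-limits.h above): UPPER_BOUND, TRSM_LIMIT_A and TRSM_LIMIT_B
 * use full template specialization to select a numeric limit per element
 * type at compile time.  A condensed sketch of the pattern (the complex-type
 * specializations and the __template_static qualifier are omitted here):
 *
 *     template <typename T> static cl_double UPPER_BOUND();
 *     template <> cl_double UPPER_BOUND<cl_float>()  { return pow(2.0, 23); }
 *     template <> cl_double UPPER_BOUND<cl_double>() { return pow(2.0, 52); }
 *
 *     // used inside a templated generator as: cl_double bound = UPPER_BOUND<T>();
 *
 * The exponents 23 and 52 presumably match the mantissa widths of float and
 * double, so sums kept below these bounds do not lose integer precision.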
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef _TESTDG_H_ #define _TESTDG_H_ // Coming from testDG.hpp enum TRIANGLE_OPERATIONS { LTOU, UTOL, SWAP }; enum RealMatrixCreationFlags { //NO_FLAGS = 0, ROW_MAJOR_ORDER = 1, PACKED_MATRIX = 2, SYMMETRIC_MATRIX = 4, UPPER_HALF_ONLY = 8, LOWER_HALF_ONLY = 16, NO_ALIGNMENT = 32, UNIT_DIAGONAL = 64, RANDOM_INIT = 128, ZERO_DIAGONAL = 256 }; #define setDiagonalUnity() setDiagonalUnityOrNonUnity(1, data, rows, cols, lda, vectorLength, creationFlags, bound) // Unity diagonal #define setDiagonalRandom() setDiagonalUnityOrNonUnity(2, data, rows, cols, lda, vectorLength, creationFlags, bound) // Random values #define setDiagonalZero() setDiagonalUnityOrNonUnity(0, data, rows, cols, lda, vectorLength, creationFlags, bound) // Zero diagonal // Column-Major is i,j replaced and RML is CMU // So CMU(i,j) will be RML(j,i) // The following is Row-Major packed #define RMLPacked(i,j) ((T*)data + ((i*(i+1))/2 + j) * vectorLength) #define RMUPacked(i,j) ((T*)data + ((i*((2* rows) + 1 - i))/2 + (j -i))* vectorLength ) #define CMUPacked(i,j) ((T*)data + ((j*(j+1))/2 + i)* vectorLength) #define CMLPacked(i,j) ((T*)data + ((j*((2*rows) + 1 - j))/2 + (i - j))* vectorLength) #endif clblas-2.10/src/tests/include/timer.h000066400000000000000000000026331264277366700175710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TIMER_H_ #define TIMER_H_ #ifdef __cplusplus extern "C" { #endif #if defined(_MSC_VER) typedef unsigned long long nano_time_t; #define NANOTIME_MAX (~0ULL - 1) #elif defined(__APPLE__) #include typedef uint64_t nano_time_t; #define NANOTIME_MAX (UINT64_MAX - 1) #else typedef unsigned long nano_time_t; #define NANOTIME_MAX (~0UL - 1) #endif #define NANOTIME_ERR (NANOTIME_MAX + 1) nano_time_t conv2millisec(nano_time_t t); nano_time_t conv2microsec(nano_time_t t); nano_time_t conv2nanosec(nano_time_t t); nano_time_t getCurrentTime(void); void sleepTime(nano_time_t t); #ifdef __cplusplus } /* extern "C" { */ #endif #endif /* TIMER_H_ */ clblas-2.10/src/tests/include/tpmv.h000066400000000000000000000015271264277366700174400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
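 * NOTE (on the packed-storage macros in testDG.h above): a packed triangular
 * matrix stores its rows back to back with no padding, so the linear offset
 * of element (i, j) is "elements in all preceding rows" plus the position
 * within row i.  For row-major lower storage row r holds r + 1 elements
 * (columns 0..r), so rows 0..i-1 contribute i*(i+1)/2 elements and element
 * (i, j), j <= i, lives at offset i*(i+1)/2 + j -- exactly what
 * RMLPacked(i,j) computes, scaled by vectorLength to step over real or
 * complex elements.  RMUPacked, CMUPacked and CMLPacked follow the same
 * counting argument for the other order/uplo combinations.  Quick check for
 * a 3x3 lower row-major packed matrix:
 *
 *     (0,0)->0, (1,0)->1, (1,1)->2, (2,0)->3, (2,1)->4, (2,2)->5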
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TPMV_H_ #define TPMV_H_ #define TRMV_PACKED #include "trmv.h" #undef TRMV_PACKED #endif clblas-2.10/src/tests/include/tpsv.h000066400000000000000000000015311264277366700174410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #ifndef TPSV_H_ #define TPSV_H_ #define TRSV_PACKED_ #include "trsv.h" #undef TRSV_PACKED_ #endif clblas-2.10/src/tests/include/trmm.h000066400000000000000000000106231264277366700174260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
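 * NOTE (on tpmv.h / tpsv.h above): the packed-matrix test fixtures are not
 * written out separately; tpmv.h simply does
 *
 *     #define TRMV_PACKED
 *     #include "trmv.h"
 *     #undef  TRMV_PACKED
 *
 * and trmv.h checks that macro to decide whether to declare the fixture as
 * TRMV or TPMV and whether lda is meaningful (it is forced to 0 in the
 * packed case).  tpsv.h plays the same trick with TRSV_PACKED_ and trsv.h.
 * The DUPLICIT guard in trmv.h / trsv.h appears to exist so the class body
 * is not compiled twice if the header is pulled in both ways in one
 * translation unit.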
* ************************************************************************/ #ifndef TRMM_H_ #define TRMM_H_ #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class TRMM : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasSide, // side clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // M int, // N ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->side = side; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->M = M; params->N = N; params->offA = offA; params->offBX = offB; params->lda = lda; params->ldb = ldb; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); side = ::std::tr1::get<1>(GetParam()); uplo = ::std::tr1::get<2>(GetParam()); transA = ::std::tr1::get<3>(GetParam()); diag = ::std::tr1::get<4>(GetParam()); M = ::std::tr1::get<5>(GetParam()); N = ::std::tr1::get<6>(GetParam()); extra = ::std::tr1::get<7>(GetParam()); offA = extra.offA; offB = extra.offBX; lda = extra.strideA.ld; ldb = extra.strideBX.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } switch (side) { case clblasLeft: rowsA = M; columnsA = M; break; case clblasRight: rowsA = N; columnsA = N; break; } rowsB = M; columnsB = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; break; } printTestParams(order, side, uplo, transA, diag, M, N, useAlpha, base->alpha(), offA, lda, offB, ldb); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasSide side; clblasUplo uplo; clblasTranspose transA; clblasDiag diag; size_t M, N; size_t offA, offB; size_t lda, ldb; unsigned int seed; bool useAlpha; ComplexLong paramAlpha; size_t rowsA, columnsA; size_t rowsB, columnsB; ::clMath::BlasBase *base; cl_ulong imageA, imageB; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // TRMM_H_ clblas-2.10/src/tests/include/trmv.h000066400000000000000000000063331264277366700174420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
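 * NOTE (on TRMM::SetUp above): the leading dimensions supplied by the test
 * parameters are treated as lower bounds; SetUp clamps them to the logical
 * matrix extent for the chosen order and then widens the stored matrix to
 * match.  For example with side = clblasLeft, M = 37, N = 64, row-major
 * order and lda = 0, the fixture ends up with rowsA = columnsA = 37,
 * lda = max(0, 37) = 37 and columnsA = lda = 37, i.e. lda = 0 simply means
 * "tightly packed".  The TRSM fixture further down uses identical logic.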
* ************************************************************************/ #if !defined(TRMV_PACKED) #ifndef TRMV_H #define TRMV_H #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include using ::testing::TestWithParam; #ifndef TRMV_PACKED class TRMV : public TestWithParam< #else class TPMV : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // N int, // lda, 0 - undefined int, // incx, should be greater than 0 int, //offa int, //offx int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->N = N; params->lda = lda; params->incx = incx; params->offa = offa; params->offBX = offx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); diag = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); lda = ::std::tr1::get<5>(GetParam()); incx = ::std::tr1::get<6>(GetParam()); offa = ::std::tr1::get<7>(GetParam()); offx = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); #ifndef TRMV_PACKED lda = ::std::max( lda, N ); #else lda =0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; clblasDiag diag; size_t N; size_t lda; int incx; size_t offx, offa; unsigned int seed; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif clblas-2.10/src/tests/include/trsm.h000066400000000000000000000107041264277366700174340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #ifndef TRSM_H_ #define TRSM_H_ #include #include #include #include #include using namespace clMath; using ::testing::TestWithParam; class TRSM : public TestWithParam< ::std::tr1::tuple< clblasOrder, // order clblasSide, // side clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // M int, // N ExtraTestSizes, int // numCommandQueues > > { public: void getParams(TestParams *params) { memset(params, 0, sizeof(TestParams)); params->order = order; params->side = side; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->M = M; params->N = N; params->offA = offA; params->offBX = offB; params->lda = lda; params->ldb = ldb; params->rowsA = rowsA; params->columnsA = columnsA; params->rowsB = rowsB; params->columnsB = columnsB; params->alpha = paramAlpha; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { ExtraTestSizes extra; order = ::std::tr1::get<0>(GetParam()); side = ::std::tr1::get<1>(GetParam()); uplo = ::std::tr1::get<2>(GetParam()); transA = ::std::tr1::get<3>(GetParam()); diag = ::std::tr1::get<4>(GetParam()); M = ::std::tr1::get<5>(GetParam()); N = ::std::tr1::get<6>(GetParam()); extra = ::std::tr1::get<7>(GetParam()); offA = extra.offA; offB = extra.offBX; lda = extra.strideA.ld; ldb = extra.strideBX.ld; numCommandQueues = ::std::tr1::get<8>(GetParam()); base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } useAlpha = base->useAlpha(); if (useAlpha != 0) { paramAlpha = base->alpha(); } if (base->useM()) { M = base->M(); } if (base->useN()) { N = base->N(); } switch (side) { case clblasLeft: rowsA = M; columnsA = M; break; case clblasRight: rowsA = N; columnsA = N; break; } rowsB = M; columnsB = N; switch (order) { case clblasRowMajor: lda = ::std::max(lda, columnsA); columnsA = lda; ldb = ::std::max(ldb, columnsB); columnsB = ldb; break; case clblasColumnMajor: lda = ::std::max(lda, rowsA); rowsA = lda; ldb = ::std::max(ldb, rowsB); rowsB = ldb; break; } printTestParams(order, side, uplo, transA, diag, M, N, useAlpha, base->alpha(), offA, lda, offB, ldb); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasSide side; clblasUplo uplo; clblasTranspose transA; clblasDiag diag; size_t M, N; size_t offA, offB; size_t lda, ldb; unsigned int seed; bool useAlpha; ComplexLong paramAlpha; size_t rowsA, columnsA; size_t rowsB, columnsB; ::clMath::BlasBase *base; cl_ulong imageA; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // TRSM_H_ clblas-2.10/src/tests/include/trsv.h000066400000000000000000000064341264277366700174520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #if !defined(TRSV_PACKED_) #ifndef TRSV_H_ #define TRSV_H_ #else #define DUPLICIT #endif #endif #ifndef DUPLICIT #include #include #include #include #include using ::testing::TestWithParam; #ifndef TRSV_PACKED_ class TRSV : public TestWithParam< #else class TPSV : public TestWithParam< #endif ::std::tr1::tuple< clblasOrder, // order clblasUplo, // uplo clblasTranspose, // transA clblasDiag, // diag int, // N int, // lda, 0 - undefined int, // incx, should be greater than 0 int, //offa int, //offx int // numCommandQueues > > { public: void getParams(TestParams *params) { params->order = order; params->uplo = uplo; params->transA = transA; params->diag = diag; params->seed = seed; params->N = N; params->lda = lda; params->incx = incx; params->offa = offa; params->offBX = offx; params->numCommandQueues = numCommandQueues; } protected: virtual void SetUp() { order = ::std::tr1::get<0>(GetParam()); uplo = ::std::tr1::get<1>(GetParam()); transA = ::std::tr1::get<2>(GetParam()); diag = ::std::tr1::get<3>(GetParam()); N = ::std::tr1::get<4>(GetParam()); lda = ::std::tr1::get<5>(GetParam()); incx = ::std::tr1::get<6>(GetParam()); offa = ::std::tr1::get<7>(GetParam()); offx = ::std::tr1::get<8>(GetParam()); numCommandQueues = ::std::tr1::get<9>(GetParam()); #ifndef TRSV_PACKED_ lda = ::std::max( lda, N ); #else lda = 0; #endif base = ::clMath::BlasBase::getInstance(); seed = base->seed(); useNumCommandQueues = base->useNumCommandQueues(); if (useNumCommandQueues) { numCommandQueues = base->numCommandQueues(); } if (base->useN()) { N = base->N(); } printTestParams(order, uplo, transA, diag, N, lda, incx, offa, offx); ::std::cerr << "seed = " << seed << ::std::endl; ::std::cerr << "queues = " << numCommandQueues << ::std::endl; } clblasOrder order; clblasUplo uplo; clblasTranspose transA; clblasDiag diag; size_t N; size_t lda; int incx; size_t offx, offa; unsigned int seed; ::clMath::BlasBase *base; cl_ulong imageA, imageX; bool useNumCommandQueues; cl_uint numCommandQueues; }; #endif // DUPLICIT clblas-2.10/src/tests/performance/000077500000000000000000000000001264277366700171525ustar00rootroot00000000000000clblas-2.10/src/tests/performance/BlasBase-perf.cpp000066400000000000000000000060771264277366700222760ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include #include #include #include #include #include namespace clMath { static size_t imageMaxDimension(cl_context context, int widthHeight) { cl_int err; cl_device_id devices[2]; size_t i, retSize; size_t rc = (size_t)-1; cl_device_info par; par = (widthHeight) ? 
CL_DEVICE_IMAGE2D_MAX_HEIGHT : CL_DEVICE_IMAGE2D_MAX_WIDTH; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(devices), devices, &retSize); if (err == CL_SUCCESS) { size_t s; retSize /= sizeof(cl_device_id); for (i = 0; (i < retSize) && (err == CL_SUCCESS); i++) { err = clGetDeviceInfo(devices[i], par, sizeof(s), &s, NULL); if (err == CL_SUCCESS) { rc = std::min(rc, s); } } } if (err != CL_SUCCESS) { rc = 0; } return rc; } static size_t imageMaxWidth(cl_context context) { return imageMaxDimension(context, 0); } static size_t imageMaxHeight(cl_context context) { return imageMaxDimension(context, 1); } clblasStatus BlasBase::addScratchImages(void) { //cl_ulong memSize, allocSize; //size_t width, height; //clblasStatus status; //float scale; ///* // * get maximum amount of memory each image can takes, not // * forgetting that it can be up to three matrices residing // * in memory objects // */ //allocSize = maxMemAllocSize(); //memSize = availGlobalMemSize(0); //if (allocSize > memSize / 5) { // allocSize = memSize / 5; // scale = 1.4f; //} //else { // scale = 1.5f; //} //height = static_cast(sqrt(static_cast(allocSize) / sizeof(cl_float))); //width = height / 4; //height = static_cast(height / scale); //width = static_cast(width * scale); //if (height > imageMaxHeight(context_)) { // height = imageMaxHeight(context_); //} //if (width > imageMaxWidth(context_)) { // width = imageMaxWidth(context_); //} //imageA_ = clblasAddScratchImage(context_, width, height, &status); //if (imageA_) { // imageB_ = clblasAddScratchImage(context_, width, height, &status); //} //return status; return clblasNotImplemented; } } // namespace clblas-2.10/src/tests/performance/PerformanceRecorder.cpp000066400000000000000000000071121264277366700236060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Overall performance recorder implementation */ #include #include "PerformanceRecorder.h" #include using namespace clMath; PerformanceRecorder::PerformanceRecorder() { unsigned int size = static_cast(BLAS_FUNCTION_END); records_ = new PerfRecord[size]; memset(records_, 0, sizeof(PerfRecord) * size); } PerformanceRecorder::~PerformanceRecorder() { delete[] records_; } void PerformanceRecorder::etalonRegPerf( BlasFunction fn, unsigned long us, problem_size_t size) { int id = static_cast(fn); records_[id].etalonGFlops += ((gflops_t)size / us) / 1000; records_[id].etalonGbps += ((gbps_t)size / us) / 1000; records_[id].etalonNrRuns++; } void PerformanceRecorder::clblasRegPerf( BlasFunction fn, unsigned long us, problem_size_t size) { int id = static_cast(fn); records_[id].clblasGFlops += ((gflops_t)size / us) / 1000; records_[id].clblasGbps += ((gbps_t)size / us) / 1000; if( (functionBlasLevel(static_cast(fn)) == 2) //display metrics in GBps if it is a BLAS-2/1 functio || (functionBlasLevel(static_cast(fn)) == 1) ) { std::cerr << "clBlas GBPS : " << (((gbps_t)size / us) / 1000) << std::endl << std::endl << std::endl; } else { std::cerr << "clBlas GFLOPS : " << (((gflops_t)size / us) / 1000) << std::endl << std::endl << std::endl; } records_[id].clblasNrRuns++; } void PerformanceRecorder::regTimeRatio(BlasFunction fn, double ratio) { int id = static_cast(fn); records_[id].timeRatio += ratio; records_[id].nrRatios++; } gflops_t PerformanceRecorder::etalonAvgPerf(BlasFunction fn) { int id = static_cast(fn); gflops_t gflops = records_[id].etalonGFlops; if (records_[id].etalonNrRuns) { gflops /= records_[id].etalonNrRuns; } return gflops; } gflops_t PerformanceRecorder::clblasAvgPerf(BlasFunction fn) { int id = static_cast(fn); gflops_t gflops = records_[id].clblasGFlops; if (records_[id].clblasNrRuns) { gflops /= records_[id].clblasNrRuns; } return gflops; } gbps_t PerformanceRecorder::etalonAvgGbpsPerf(BlasFunction fn) { int id = static_cast(fn); gbps_t gbps = records_[id].etalonGbps; if (records_[id].etalonNrRuns) { gbps /= records_[id].etalonNrRuns; } return gbps; } gbps_t PerformanceRecorder::clblasAvgGbpsPerf(BlasFunction fn) { int id = static_cast(fn); gbps_t gbps = records_[id].clblasGbps; if (records_[id].clblasNrRuns) { gbps /= records_[id].clblasNrRuns; } return gbps; } double PerformanceRecorder::avgTimeRatio(BlasFunction fn) { int id = static_cast(fn); double ratio = records_[id].timeRatio; if (records_[id].nrRatios) { ratio /= records_[id].nrRatios; } return ratio; } clblas-2.10/src/tests/performance/PerformanceRecorder.h000066400000000000000000000047771264277366700232710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
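 * NOTE (on PerformanceRecorder.cpp above): the accumulation
 * ((gflops_t)size / us) / 1000 is a unit conversion.  size is the problem
 * size in operations (or bytes for the GBps counters) and us is the elapsed
 * time in microseconds, so size/us is operations per microsecond, i.e.
 * millions of operations per second, and dividing by 1000 yields GFLOPS
 * (or GB/s).  For example 2e9 flops in 4000 us gives 2e9 / 4000 / 1000 =
 * 500 GFLOPS.  The per-function totals are averaged over etalonNrRuns /
 * clblasNrRuns when the Avg* accessors are queried.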
* ************************************************************************/ /* * Overall performance recorder definition */ #ifndef PERFORMANCERECORDER_H_ #define PERFORMANCERECORDER_H_ #include #include enum { MAX_TIMES_PER_FUNCTION = 3 }; namespace clMath { typedef double gflops_t; typedef double gbps_t; #if defined(_MSC_VER) typedef unsigned long long problem_size_t; #else typedef uint64_t problem_size_t; #endif class PerformanceRecorder { public: PerformanceRecorder(); virtual ~PerformanceRecorder(); // register etalon function execution time void etalonRegPerf(BlasFunction fn, unsigned long us, problem_size_t size); // register clblas function execution time void clblasRegPerf(BlasFunction fn, unsigned long us, problem_size_t size); /* * register time ratio of the clblas function against this one * of the reference implementation */ void regTimeRatio(BlasFunction fn, double ratio); // get average etalon function average performance in giga-flops and gbps gflops_t etalonAvgPerf(BlasFunction fn); gbps_t etalonAvgGbpsPerf(BlasFunction fn); // get clblas function average performance in giga-flops and gbps gflops_t clblasAvgPerf(BlasFunction fn); gbps_t clblasAvgGbpsPerf(BlasFunction fn); /* * get average time ratio of a clblas function against * the reference implementation */ double avgTimeRatio(BlasFunction fn); private: struct PerfRecord { gflops_t etalonGFlops; gflops_t clblasGFlops; gbps_t etalonGbps; gbps_t clblasGbps; unsigned int etalonNrRuns; unsigned int clblasNrRuns; double timeRatio; unsigned int nrRatios; }; PerfRecord *records_; }; } // namespace clMath extern clMath::PerformanceRecorder *perfRecorder; #endif /* PERFORMANCERECORDER_H_ */ clblas-2.10/src/tests/performance/PerformanceTest.cpp000066400000000000000000000063571264277366700227720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Basic performance test case class implementation */ #include #include #include #include #include "PerformanceTest.h" #include "timer.h" using namespace std; using namespace clMath; enum { NUMBER_TEST_RUNS = 5 // 1000 }; int PerformanceTest::run(int opFactor) { int i; nano_time_t t1, t2; nano_time_t time = NANOTIME_MAX; if (prepare()) { return -1; } /* * etalon and tested procedures several times and select * the minimum time so that to reduce delay would be introduced * by another OS components or applications */ t1 = NANOTIME_MAX; for (i = 0; (i < NUMBER_TEST_RUNS) && (time != NANOTIME_ERR); i++) { time = etalonPerfSingle(); if (time < t1) { t1 = time; } } t2 = NANOTIME_MAX; for (i = 0; (i < NUMBER_TEST_RUNS) && (time != NANOTIME_ERR); i++) { time = clblasPerfSingle(); if (time < t2) { t2 = time; } } if (time == NANOTIME_ERR) { return -1; } t1 = conv2microsec(t1); t2 = conv2microsec(t2); #ifdef PERF_TEST_WITH_ACML std::cerr << "Acml "; #endif if ( (functionBlasLevel(function_) == 2) || (functionBlasLevel(function_) == 1) ) { cerr << "reference function has worked in " << t1 << " microseconds, clBlas function has worked in " << t2 << " microseconds"; } else { cerr << "reference function has worked in " << t1 / 1000 << " milliseconds, clBlas function has worked in " << t2 / 1000 << " milliseconds"; } if (t2 != 0) { cerr << ", time ratio is " << (double)t1 / t2; } cerr << endl; perfRecorder->etalonRegPerf(function_, static_cast(t1), prob_size_ * opFactor); perfRecorder->clblasRegPerf(function_, static_cast(t2), prob_size_ * opFactor); if (t2 != 0) { perfRecorder->regTimeRatio(function_, (double)t1 / t2); } /* * Here check only if the CLBLAS version has worked not slower then * the reference one */ #if 0 return !(t2 <= t1); #else return 0; #endif } int PerformanceTest::prepare(void) { // stub return -1; } nano_time_t PerformanceTest::etalonPerfSingle(void) { // stub return NANOTIME_ERR; } nano_time_t PerformanceTest::clblasPerfSingle(void) { // stub return NANOTIME_ERR; } clblas-2.10/src/tests/performance/PerformanceTest.h000066400000000000000000000032531264277366700224270ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
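 * NOTE (on PerformanceTest::run above): a concrete test only provides the
 * three virtual hooks; run() calls prepare() once, runs each implementation
 * NUMBER_TEST_RUNS (5) times keeping the minimum wall time to filter out OS
 * noise, converts both times to microseconds, and reports them to the global
 * perfRecorder together with prob_size_ * opFactor.  A rough usage sketch --
 * FooPerfTest, problemSize(), runReference() and runClblas() are
 * illustrative names, not part of the library:
 *
 *     class FooPerfTest : public clMath::PerformanceTest {
 *     public:
 *         FooPerfTest(TestParams *p)
 *             : PerformanceTest(FN_SAXPY, problemSize(p)) { }
 *         virtual int prepare(void)                  { return 0; }
 *         virtual nano_time_t etalonPerfSingle(void) { return runReference(); }
 *         virtual nano_time_t clblasPerfSingle(void) { return runClblas(); }
 *     };
 *
 *     // ... then: int ret = FooPerfTest(&params).run(opFactor);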
* ************************************************************************/ /* * Basic performance test case class declaration */ #ifndef PERFORMANCE_TEST_H_ #define PERFORMANCE_TEST_H_ #include #include "timer.h" #include "PerformanceRecorder.h" enum { MAX_ZMATRIX_SIZE = 3072 }; namespace clMath { class PerformanceTest { public: PerformanceTest(BlasFunction function, problem_size_t prob_size) : function_(function), prob_size_(prob_size) { }; virtual ~PerformanceTest() { } /* * On runtime error returns -1; otherwise returns 1 * if the CLBLAS version has been slower, otherwise returns 0 * * @opFactor: scaling factor showing number of operations per each element */ int run(int opFactor); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); private: BlasFunction function_; problem_size_t prob_size_; }; } // namespace clMath #endif /* PERFORMANCE_TEST_H_ */ clblas-2.10/src/tests/performance/TrxmPerformanceTest.cpp000066400000000000000000000244471264277366700236450ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Performance test case class implementation for * TRMM and TRSM routines */ #include // memcpy() #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; namespace clMath { template class TrxmPerformanceTest : public PerformanceTest { public: virtual ~TrxmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TrxmPerformanceTest *perfCase; int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_STRMM || fn == FN_DTRMM || fn == FN_STRSM || fn == FN_DTRSM) { opFactor = 1; } else { opFactor = 4; } if ((fn == FN_DTRMM || fn == FN_ZTRMM || fn == FN_DTRSM || fn == FN_ZTRSM) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } perfCase = new TrxmPerformanceTest(fn, params); if (!perfCase->areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient " "resources" << std::endl; } else { ret = perfCase->run(opFactor); } delete perfCase; ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TrxmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *B_; ElemType *backB_; cl_mem mobjA_; cl_mem mobjB_; ::clMath::BlasBase *base_; bool isTrsm_; static problem_size_t problemSize(TestParams *params); }; } // namespace template TrxmPerformanceTest::TrxmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, problemSize(params)), params_(*params), mobjA_(NULL), mobjB_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; backB_ = new ElemType[params_.rowsB * params_.columnsB]; base_ = ::clMath::BlasBase::getInstance(); isTrsm_ = (static_cast(fn) >= FN_STRSM); } template TrxmPerformanceTest::~TrxmPerformanceTest() { delete[] A_; delete[] B_; delete[] backB_; clReleaseMemObject(mobjB_); clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TrxmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; bool ret = true; size_t m = params->M, n = params->N; size_t asize; clblasSide side = params->side; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); asize = (side == clblasLeft) ? m : n; if (base->useImages()) { size_t iw; // overall 2 images 1/5 of gmemSize each and 2 memory objects maxMatrSize = 3 * gmemSize / 10; iw = base->scratchImageWidth() * sizeof(cl_float4) / sizeof(ElemType); if (isTrsm_) { size_t ih, nb; // check if matrix A is fitted to the image with 32x32 blocks ih = base->scratchImageHeight(); nb = asize / 32 + (asize % 32 != 0); ret = ((asize * asize + nb * 32 * 32) / 2 < iw * ih); } else { ret = (std::max(n, asize) < iw); } } else { maxMatrSize = gmemSize / 2; } maxMatrSize = std::min(maxMatrSize, allocSize); if (ret) { ret = ((m * n * sizeof(ElemType) < maxMatrSize) && (asize * asize * sizeof(ElemType) < maxMatrSize)); } return ret; } template int TrxmPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); if (useAlpha) { alpha_ = convertMultiplier(base_->alpha()); } if (isTrsm_) { randomTrsmMatrices(params_.order, params_.side, params_.uplo, params_.diag, params_.M, params_.N, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb); } else { randomTrmmMatrices(params_.order, params_.side, params_.uplo, params_.diag, params_.M, params_.N, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb); } mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(backB_, params_.rowsB * params_.columnsB * sizeof(ElemType), params_.offBX * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjB_) ? 
0 : -1; } template nano_time_t TrxmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, ldb; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(B_, backB_, params_.rowsB * params_.columnsB * sizeof(ElemType)); order = params_.order; lda = params_.lda; ldb = params_.ldb; #ifdef PERF_TEST_WITH_ACML if (order == clblasRowMajor) { order = clblasColumnMajor; if (params_.side == clblasLeft) { lda = params_.M; } else { lda = params_.N; } ldb = params_.M; } time = getCurrentTime(); if (isTrsm_) { clMath::blas::trsm(order, params_.side, params_.uplo, params_.transA, params_.diag, params_.M, params_.N, alpha_, A_, lda, B_, ldb); } else { clMath::blas::trmm(order, params_.side, params_.uplo, params_.transA, params_.diag, params_.M, params_.N, alpha_, A_, lda, B_, ldb); } time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TrxmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue; queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjB_, CL_TRUE, 0, params_.rowsB * params_.columnsB * sizeof(ElemType), backB_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix B buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cerr << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; if (isTrsm_) { status = (cl_int)clMath::clblas::trsm(params_.order, params_.side, params_.uplo, params_.transA, params_.diag, params_.M, params_.N, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, 1, &queue, 0, NULL, &event); } else { status = (cl_int)clMath::clblas::trmm(params_.order, params_.side, params_.uplo, params_.transA, params_.diag, params_.M, params_.N, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, 1, &queue, 0, NULL, &event); } if (status != CL_SUCCESS) { cerr << "The CLBLAS TRXM function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } return time; } template problem_size_t TrxmPerformanceTest::problemSize(TestParams *params) { problem_size_t size; if (params->side == clblasRight) { size = (problem_size_t)params->N * params->N * params->M; } else { size = (problem_size_t)params->M * params->M * params->N; } return size; } clblas-2.10/src/tests/performance/perf-asum.cpp000066400000000000000000000201221264277366700215520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class AsumPerformanceTest : public PerformanceTest { public: virtual ~AsumPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { AsumPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DASUM) || (fn == FN_DZASUM)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: AsumPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *blasX_; cl_mem mobjX_; cl_mem mobjAsum_; cl_mem scratchBuff; size_t lengthX; ::clMath::BlasBase *base_; }; template AsumPerformanceTest::AsumPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (1 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjAsum_(NULL) { blasX_ = NULL; mobjX_ = mobjAsum_= scratchBuff = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); try { blasX_ = new ElemType[lengthX + params_.offBX]; } catch(bad_alloc& ba) { blasX_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_ = mobjAsum_= scratchBuff = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template AsumPerformanceTest::~AsumPerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjAsum_ != NULL ) { clReleaseMemObject(mobjAsum_); } if( scratchBuff!= NULL ) { clReleaseMemObject(scratchBuff); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool AsumPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeAsum; if((blasX_ == NULL) ) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeAsum = (1 + params->offa) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeAsum < allocSize)); ret = (ret && ((sizeX + sizeAsum) < gmemSize)); return ret; } template int AsumPerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0, true); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjAsum_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); scratchBuff = base_->createEnqueueBuffer(NULL, ((lengthX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjAsum_ != NULL)&& (scratchBuff != NULL) )? 0 : -1; } template nano_time_t AsumPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::asum(params_.N, blasX_, params_.offBX, params_.incx ); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t AsumPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::asum( type, params_.N, mobjAsum_, params_.offa, mobjX_, params_.offBX, params_.incx, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS ASUM function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(ASUM, sasum) { TestParams params; getParams(¶ms); AsumPerformanceTest::runInstance(FN_SASUM, ¶ms); } TEST_P(ASUM, dasum) { TestParams params; getParams(¶ms); AsumPerformanceTest::runInstance(FN_DASUM, ¶ms); } TEST_P(ASUM, scasum) { TestParams params; getParams(¶ms); AsumPerformanceTest::runInstance(FN_SCASUM, ¶ms); } TEST_P(ASUM, dzasum) { TestParams params; getParams(¶ms); AsumPerformanceTest::runInstance(FN_DZASUM, ¶ms); } clblas-2.10/src/tests/performance/perf-axpy.cpp000066400000000000000000000222301264277366700215700ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * AXPY performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class AxpyPerformanceTest : public PerformanceTest { public: virtual ~AxpyPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { AxpyPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DAXPY) || (fn == FN_ZAXPY)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: AxpyPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *X_; ElemType *Y_; ElemType *blasX_; ElemType *blasY_; cl_mem mobjX_; cl_mem mobjY_; size_t lengthX; size_t lengthY; ::clMath::BlasBase *base_; }; template AxpyPerformanceTest::AxpyPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (3 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL) { X_ = blasX_ = Y_ = blasY_ = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); lengthY = 1 + (params->N - 1) * abs(params_.incy); try { X_ = new ElemType[lengthX + params_.offBX]; blasX_ = new ElemType[lengthX + params_.offBX]; Y_ = new ElemType[lengthY + params_.offCY]; blasY_ = new ElemType[lengthY + params_.offCY]; } catch(bad_alloc& ba) { X_ = blasX_ = Y_ = blasY_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template AxpyPerformanceTest::~AxpyPerformanceTest() { if(X_ != NULL) { delete[] X_; } if(blasX_ != NULL) { delete[] blasX_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if(Y_ != NULL) { delete[] Y_; } if(blasY_ != NULL) { delete[] blasY_; } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool AxpyPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, reqdSize; bool ret; if((X_ == NULL) || (blasX_ == NULL) || (Y_ == NULL) || (blasY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); reqdSize = (lengthX + params->offBX + lengthY + params->offCY) * sizeof(ElemType); ret = (reqdSize) < allocSize; ret = ret && (reqdSize < gmemSize); return ret; } template int AxpyPerformanceTest::prepare(void) { alpha_ = convertMultiplier(params_.alpha); randomVectors(params_.N, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy); memcpy(blasX_, X_, (lengthX + params_.offBX)* sizeof(ElemType)); memcpy(blasY_, Y_, (lengthY + params_.offCY)* sizeof(ElemType)); mobjX_ = base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_ONLY); 
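    // Note: X is only read by AXPY, so its buffer is created with
    // CL_MEM_READ_ONLY above; Y is updated in place (y = alpha*x + y) and
    // therefore needs CL_MEM_READ_WRITE below.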
mobjY_ = base_->createEnqueueBuffer(Y_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjY_ != NULL))? 0 : -1; } template nano_time_t AxpyPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::axpy(params_.N, alpha_, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t AxpyPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthX + params_.offBX) * sizeof(ElemType), X_, 0, NULL, &event); status |= clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthY + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "mobjX_ or mobjY_ buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::axpy(params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS AXPY function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // axpy performance test TEST_P(AXPY, saxpy) { TestParams params; getParams(¶ms); AxpyPerformanceTest::runInstance(FN_SAXPY, ¶ms); } TEST_P(AXPY, daxpy) { TestParams params; getParams(¶ms); AxpyPerformanceTest::runInstance(FN_DAXPY, ¶ms); } TEST_P(AXPY, caxpy) { TestParams params; getParams(¶ms); AxpyPerformanceTest::runInstance(FN_CAXPY, ¶ms); } TEST_P(AXPY, zaxpy) { TestParams params; getParams(¶ms); AxpyPerformanceTest::runInstance(FN_ZAXPY, ¶ms); } clblas-2.10/src/tests/performance/perf-copy.cpp000066400000000000000000000214041264277366700215630ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
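 * NOTE (on clblasPerfSingle in perf-axpy.cpp above, and the same pattern in
 * the asum and copy tests): when TIMING is defined, the clBLAS call is
 * enqueued in a tight loop (50 or 100 iterations), the queue is drained with
 * clFinish() around the loop, and the elapsed time is divided by the
 * iteration count -- a steady-state throughput measurement that hides kernel
 * launch latency.  The #else path instead times a single call via
 * flushAll() / waitForSuccessfulFinish(), which is closer to a cold-start
 * latency measurement.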
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class CopyPerformanceTest : public PerformanceTest { public: virtual ~CopyPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { CopyPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DCOPY) || (fn == FN_ZCOPY)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: CopyPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *backY_; ElemType *blasX_; ElemType *blasY_; cl_mem mobjX_; cl_mem mobjY_; size_t lengthX; size_t lengthY; ::clMath::BlasBase *base_; }; template CopyPerformanceTest::CopyPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL) { blasX_ = NULL; blasY_ = NULL; backY_ = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); lengthY = 1 + (params->N - 1) * abs(params_.incy); try { backY_ = new ElemType[lengthY + params_.offCY]; blasX_ = new ElemType[lengthX + params_.offBX]; blasY_ = new ElemType[lengthY + params_.offCY]; } catch(bad_alloc& ba) { backY_ = blasX_ = blasY_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjY_ = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template CopyPerformanceTest::~CopyPerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if(blasY_ != NULL) { delete[] blasY_; } if(backY_ != NULL) { delete[] backY_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } } template bool CopyPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeY; if((blasX_ == NULL) || (blasY_ == NULL) || (backY_ ==NULL) ) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeY = (lengthY + params->offCY) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeY < allocSize)); ret = (ret && ((sizeX + sizeY) < gmemSize)); return ret; } template int CopyPerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + 
params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy); memcpy(backY_, blasY_, (lengthY + params_.offCY)* sizeof(ElemType)); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(blasY_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjY_ != NULL))? 0 : -1; } template nano_time_t CopyPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::copy(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t CopyPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthY + params_.offCY) * sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::copy(type, params_.N, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS COPY function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(COPY, scopy) { TestParams params; getParams(¶ms); CopyPerformanceTest::runInstance(FN_SCOPY, ¶ms); } TEST_P(COPY, dcopy) { TestParams params; getParams(¶ms); CopyPerformanceTest::runInstance(FN_DCOPY, ¶ms); } TEST_P(COPY, ccopy) { TestParams params; getParams(¶ms); CopyPerformanceTest::runInstance(FN_CCOPY, ¶ms); } TEST_P(COPY, zcopy) { TestParams params; getParams(¶ms); CopyPerformanceTest::runInstance(FN_ZCOPY, ¶ms); } clblas-2.10/src/tests/performance/perf-dot.cpp000066400000000000000000000213161264277366700214010ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class DotPerformanceTest : public PerformanceTest { public: virtual ~DotPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { DotPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DDOT) || (fn == FN_ZDOTU)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: DotPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *blasX_; ElemType *blasY_; cl_mem mobjX_; cl_mem mobjY_; cl_mem mobjDP_; cl_mem scratchBuff; size_t lengthX; size_t lengthY; ::clMath::BlasBase *base_; }; template DotPerformanceTest::DotPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL),mobjDP_(NULL) { blasX_ = NULL; blasY_ = NULL; mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); lengthY = 1 + (params->N - 1) * abs(params_.incy); try { blasX_ = new ElemType[lengthX + params_.offBX]; blasY_ = new ElemType[lengthY + params_.offCY]; } catch(bad_alloc& ba) { blasX_ = blasY_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template DotPerformanceTest::~DotPerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if(blasY_ != NULL) { delete[] blasY_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } if( mobjDP_ != NULL ) { clReleaseMemObject(mobjDP_); } if( scratchBuff!= NULL ) { clReleaseMemObject(scratchBuff); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool DotPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeY, sizeDP; if((blasX_ == NULL) || (blasY_ == NULL) ) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeY = (lengthY + params->offCY) * sizeof(ElemType); sizeDP = (1 + params->offa) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeY < allocSize) && (sizeDP < allocSize)); ret = (ret && ((sizeX + sizeY + sizeDP) < gmemSize)); return ret; } template int DotPerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy, true); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(blasY_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjDP_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); scratchBuff = base_->createEnqueueBuffer(NULL, ((lengthY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjY_ != NULL) && (mobjDP_ != NULL)&& (scratchBuff != NULL) )? 0 : -1; } template nano_time_t DotPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::dot(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t DotPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? 
TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::dot( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS DOT function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(DOT, sdot) { TestParams params; getParams(¶ms); DotPerformanceTest::runInstance(FN_SDOT, ¶ms); } TEST_P(DOT, ddot) { TestParams params; getParams(¶ms); DotPerformanceTest::runInstance(FN_DDOT, ¶ms); } TEST_P(DOT, cdotu) { TestParams params; getParams(¶ms); DotPerformanceTest::runInstance(FN_CDOTU, ¶ms); } TEST_P(DOT, zdotu) { TestParams params; getParams(¶ms); DotPerformanceTest::runInstance(FN_ZDOTU, ¶ms); } clblas-2.10/src/tests/performance/perf-dotc.cpp000066400000000000000000000207021264277366700215420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class DotcPerformanceTest : public PerformanceTest { public: virtual ~DotcPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { DotcPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_ZDOTC)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: DotcPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *blasX_; ElemType *blasY_; cl_mem mobjX_; cl_mem mobjY_; cl_mem mobjDP_; cl_mem scratchBuff; size_t lengthX; size_t lengthY; ::clMath::BlasBase *base_; }; template DotcPerformanceTest::DotcPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL),mobjDP_(NULL) { blasX_ = NULL; blasY_ = NULL; mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); lengthY = 1 + (params->N - 1) * abs(params_.incy); try { blasX_ = new ElemType[lengthX + params_.offBX]; blasY_ = new ElemType[lengthY + params_.offCY]; } catch(bad_alloc& ba) { blasX_ = blasY_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template DotcPerformanceTest::~DotcPerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if(blasY_ != NULL) { delete[] blasY_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } if( mobjDP_ != NULL ) { clReleaseMemObject(mobjDP_); } if( scratchBuff!= NULL ) { clReleaseMemObject(scratchBuff); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool DotcPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeY, sizeDP; if((blasX_ == NULL) || (blasY_ == NULL) ) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeY = (lengthY + params->offCY) * sizeof(ElemType); sizeDP = (1 + params->offa) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeY < allocSize) && (sizeDP < allocSize)); ret = (ret && ((sizeX + sizeY + sizeDP) < gmemSize)); return ret; } template int DotcPerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy, true); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjY_ = 
base_->createEnqueueBuffer(blasY_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjDP_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); scratchBuff = base_->createEnqueueBuffer(NULL, ((lengthY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjY_ != NULL) && (mobjDP_ != NULL)&& (scratchBuff != NULL) )? 0 : -1; } template nano_time_t DotcPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::dotc(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t DotcPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::dot( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS DOT function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(DOTC, cdotc) { TestParams params; getParams(¶ms); DotcPerformanceTest::runInstance(FN_CDOTC, ¶ms); } TEST_P(DOTC, zdotc) { TestParams params; getParams(¶ms); DotcPerformanceTest::runInstance(FN_ZDOTC, ¶ms); } clblas-2.10/src/tests/performance/perf-gbmv.cpp000066400000000000000000000260021264277366700215430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Gbmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GbmvPerformanceTest : public PerformanceTest { public: virtual ~GbmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GbmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor = 1; BlasBase *base; base = clMath::BlasBase::getInstance(); if ((fn == FN_DGBMV || fn == FN_ZGBMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GbmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha; ElemType beta; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template GbmvPerformanceTest::GbmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)( ( (((params->order == clblasColumnMajor)? params->N : params->M) * (params->KL + params->KU + 1) // A-access - (params->KL*(params->KL+1) + params->KU*(params->KU+1)) ) // Substract hole-part for A & X +((params->transA == clblasNoTrans)? ((params->KL + params->KU + 1) * params->M + 2*params->M) // X & Y access : ((params->KL + params->KU + 1) * params->N + 2*params->N) ) // X & Y for Trans case ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL) { size_t lenA, lenX, lenY; lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * (params_.lda) + params_.offA; lenX = (((params_.transA == clblasNoTrans)? params_.N : params_.M) - 1)* params_.incx + 1 + params_.offBX; lenY = (((params_.transA == clblasNoTrans)? 
params_.M : params_.N) - 1)* params_.incy + 1 + params_.offCY; A_ = new ElemType[ lenA ]; X_ = new ElemType[ lenX ]; Y_ = new ElemType[ lenY ]; backY_ = new ElemType[ lenY ]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template GbmvPerformanceTest::~GbmvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GbmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t m = params->M, n = params->N, lda = params->lda; size_t lenA = (((params->order == clblasColumnMajor)? n : m) * lda + params->offA)* sizeof(ElemType); size_t lenX = ((((params->transA == clblasNoTrans)? params->N : params->M) - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType); size_t lenY = ((((params->transA == clblasNoTrans)? params->M : params->N) - 1)* params->incy + 1 + params->offCY) * sizeof(ElemType); if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize ); return suff; } template int GbmvPerformanceTest::prepare(void) { size_t lenX, lenY, lenA; lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda + params_.offA; if (params_.transA == clblasNoTrans) { lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX; lenY = (params_.M - 1)*abs(params_.incy) + 1 + params_.offCY; } else { lenX = (params_.M - 1)*abs(params_.incx) + 1 + params_.offBX; lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY; } randomGbmvMatrices(params_.order, params_.transA, params_.M, params_.N, &alpha, &beta, (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy ); memcpy(backY_, Y_, lenY * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1; } template nano_time_t GbmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder fOrder; clblasTranspose fTrans; size_t lda, lenY, lenA; size_t fM = params_.M, fN = params_.N, fKL = params_.KL, fKU = params_.KU; lenA = ((params_.order == clblasColumnMajor)? params_.N : params_.M) * params_.lda; lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1)* params_.incy + 1 + params_.offCY; memcpy(Y_, backY_, lenY * sizeof(ElemType)); fOrder = params_.order; fTrans = params_.transA; lda = params_.lda; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params_.transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; fM = params_.N; fN = params_.M; fKL = params_.KU; fKU = params_.KL; if( params_.transA == clblasConjTrans ) doConjugate( (A_+params_.offa), 1, lenA, lda ); } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::gbmv(fOrder, fTrans, fM, fN, fKL, fKU, alpha, A_, params_.offA, lda, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GbmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; size_t lenY; cl_command_queue queue = base_->commandQueues()[0]; lenY = (((params_.transA == clblasNoTrans)? params_.M : params_.N) - 1)* params_.incy + 1 + params_.offCY; status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, lenY * sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { status = clMath::clblas::gbmv(params_.order, params_.transA, params_.M, params_.N, params_.KL, params_.KU, alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GBMV function failed, status = " << status << endl; return NANOTIME_ERR; } } clFinish( queue ); time = getCurrentTime() - time; time /= iter; return time; } } // namespace clMath // sgbmv performance test TEST_P(GBMV, sgbmv) { TestParams params; getParams(¶ms); GbmvPerformanceTest::runInstance(FN_SGBMV, ¶ms); } // dgbmv performance test case TEST_P(GBMV, dgbmv) { TestParams params; getParams(¶ms); GbmvPerformanceTest::runInstance(FN_DGBMV, ¶ms); } // cgbmv performance test TEST_P(GBMV, cgbmv) { TestParams params; getParams(¶ms); GbmvPerformanceTest::runInstance(FN_CGBMV, ¶ms); } // zgbmv performance test case TEST_P(GBMV, zgbmv) { TestParams params; getParams(¶ms); GbmvPerformanceTest::runInstance(FN_ZGBMV, ¶ms); } clblas-2.10/src/tests/performance/perf-gemm.cpp000066400000000000000000000250371264277366700215440ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Gemm performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GemmPerformanceTest : public PerformanceTest { public: virtual ~GemmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GemmPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_SGEMM || fn == FN_DGEMM) { opFactor = 2; } else { opFactor = 8; } if ((fn == FN_DGEMM || fn == FN_ZGEMM) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GemmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template GemmPerformanceTest::GemmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->M * params->N * params->K), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template GemmPerformanceTest::~GemmPerformanceTest() { delete[] A_; delete[] B_; delete[] C_; delete[] backC_; clReleaseMemObject(mobjC_); clReleaseMemObject(mobjB_); clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GemmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; bool ret = true; size_t m = params->M, n = params->N, k = params->K; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); if (base->useImages()) { maxMatrSize = gmemSize / 5; ret = (k < base->scratchImageWidth() * sizeof(cl_float4) / sizeof(ElemType)); } else { maxMatrSize = gmemSize / 3; } maxMatrSize = std::min(maxMatrSize, allocSize); if (ret) { ret = ((std::max(m, n) * k * sizeof(ElemType) < maxMatrSize) && (m * n * sizeof(ElemType) < 
maxMatrSize)); } return ret; } template int GemmPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } randomGemmMatrices(params_.order, params_.transA, params_.transB, params_.M, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(ElemType), params_.offBX * sizeof(ElemType), CL_MEM_READ_ONLY); } if (mobjB_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 0 : -1; } template nano_time_t GemmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, ldb, ldc; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; ldb = params_.ldb; ldc = params_.ldc; #ifdef PERF_TEST_WITH_ACML if (order == clblasRowMajor) { order = clblasColumnMajor; if (params_.transA == clblasNoTrans) { lda = params_.M; } else { lda = params_.K; } if (params_.transB == clblasNoTrans) { ldb = params_.K; } else { ldb = params_.N; } ldc = params_.M; } time = getCurrentTime(); clMath::blas::gemm(order, params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_, A_, lda, B_, ldb, beta_, C_, ldc); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GemmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; status = (cl_int)clMath::clblas::gemm(params_.order, params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GEMM function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } return time; } } // namespace clMath // sgemm performance test TEST_P(GEMM, sgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_SGEMM, ¶ms); } // dgemm performance test case TEST_P(GEMM, dgemm) { TestParams params; getParams(¶ms); 
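    // runInstance() below passes opFactor = 2 for real GEMM and 8 for complex
    // GEMM: C := alpha*A*B + beta*C performs one multiply and one add per
    // element of each length-K inner product, and a complex multiply-add
    // costs roughly eight real flops. Combined with the M*N*K problem size
    // set in the constructor, an achieved rate can be estimated as, e.g.
    // (illustrative only -- the reported figures come from
    // PerformanceTest::run(opFactor), which is defined outside this file):
    //
    //     double gflops = ((double)opFactor * M * N * K) / timeInNanoseconds;
    //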
GemmPerformanceTest::runInstance(FN_DGEMM, ¶ms); } // cgemm performance test case TEST_P(GEMM, cgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_CGEMM, ¶ms); } // zgemm performance test case TEST_P(GEMM, zgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_ZGEMM, ¶ms); } clblas-2.10/src/tests/performance/perf-gemm2.cpp000066400000000000000000000257211264277366700216260ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Gemm performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" //#define SHUNT_ACML_RUN /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GemmPerformanceTest : public PerformanceTest { public: virtual ~GemmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GemmPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_SGEMM_2 || fn == FN_DGEMM_2) { opFactor = 2; } else { opFactor = 8; } if ((fn == FN_DGEMM_2 || fn == FN_ZGEMM_2) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GemmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template GemmPerformanceTest::GemmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->M * params->N * params->K), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template GemmPerformanceTest::~GemmPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(B_ != NULL) { delete[] B_; } if(C_ != NULL) { delete[] C_; } if(backC_ != NULL) { delete[] backC_; } if(mobjA_ != NULL) { clReleaseMemObject(mobjA_); } if(mobjB_ != NULL) { clReleaseMemObject(mobjB_); } if(mobjC_ != NULL) { clReleaseMemObject(mobjC_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GemmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; bool ret = true; size_t m = params->M, n = params->N, k = params->K; if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); if (base->useImages()) { maxMatrSize = gmemSize / 5; ret = (k < base->scratchImageWidth() * sizeof(cl_float4) / sizeof(ElemType)); } else { maxMatrSize = gmemSize / 3; } maxMatrSize = std::min(maxMatrSize, allocSize); if (ret) { ret = ((std::max(m, n) * k * sizeof(ElemType) < maxMatrSize) && (m * n * sizeof(ElemType) < maxMatrSize)); } return ret; } template int GemmPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } randomGemmMatrices(params_.order, params_.transA, params_.transB, params_.M, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(ElemType), params_.offBX * sizeof(ElemType), CL_MEM_READ_ONLY); } if (mobjB_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 
0 : -1; } template nano_time_t GemmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, ldb, ldc; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; ldb = params_.ldb; ldc = params_.ldc; #ifdef PERF_TEST_WITH_ACML if (order == clblasRowMajor) { order = clblasColumnMajor; if (params_.transA == clblasNoTrans) { lda = params_.M; } else { lda = params_.K; } if (params_.transB == clblasNoTrans) { ldb = params_.K; } else { ldb = params_.N; } ldc = params_.M; } time = getCurrentTime(); #ifndef SHUNT_ACML_RUN clMath::blas::gemm(order, params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_, A_, lda, B_, ldb, beta_, C_, ldc); #endif time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GemmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event, gemmevent; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; status = (cl_int)clMath::clblas::gemm2(params_.order, params_.transA, params_.transB, params_.M, params_.N, params_.K, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &gemmevent); if (status != CL_SUCCESS) { cerr << "The CLBLAS GEMM function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &gemmevent); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Returning Time: %lu\n", time); return time; } } // namespace clMath // sgemm performance test TEST_P(GEMM2, sgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_SGEMM_2, ¶ms); } // dgemm performance test case TEST_P(GEMM2, dgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_DGEMM_2, ¶ms); } // cgemm performance test case TEST_P(GEMM2, cgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_CGEMM_2, ¶ms); } // zgemm performance test case TEST_P(GEMM2, zgemm) { TestParams params; getParams(¶ms); GemmPerformanceTest::runInstance(FN_ZGEMM_2, ¶ms); } clblas-2.10/src/tests/performance/perf-gemv.cpp000066400000000000000000000232531264277366700215530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Gemv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GemvPerformanceTest : public PerformanceTest { public: virtual ~GemvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GemvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); switch (fn) { case FN_SGEMV: opFactor = sizeof(cl_float); break; case FN_DGEMV: opFactor = sizeof(cl_double); case FN_CGEMV: opFactor = sizeof(FloatComplex); break; case FN_ZGEMV: opFactor = sizeof(DoubleComplex); break; default: break; } if ((fn == FN_DGEMV || fn == FN_ZGEMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GemvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template GemvPerformanceTest::GemvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->M * params->N), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template GemvPerformanceTest::~GemvPerformanceTest() { delete[] A_; delete[] B_; delete[] C_; delete[] backC_; clReleaseMemObject(mobjC_); clReleaseMemObject(mobjB_); clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GemvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t m = params->M, n = params->N; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); maxMatrSize = gmemSize / 3; maxMatrSize = std::min(maxMatrSize, allocSize); return (m * n * sizeof(ElemType) < maxMatrSize); } template int GemvPerformanceTest::prepare(void) { size_t lenX, lenY; bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } if (params_.transA == clblasNoTrans) { lenX = params_.N; lenY = params_.M; } else { lenX = params_.M; lenY = params_.N; } randomGemmxMatrices(params_.order, params_.transA, params_.transB, params_.transC, lenY, params_.K, lenX, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(*A_), params_.offA * sizeof(*A_), CL_MEM_READ_ONLY); mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(*B_), 0, CL_MEM_READ_ONLY); mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(*backC_), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjB_ != NULL) && (mobjC_ != NULL)) ? 
0 : -1; } template nano_time_t GemvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML // #warning "GEMV performance test not implemented" time = NANOTIME_MAX; order = order; lda = lda; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GemvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; status = (cl_int)clMath::clblas::gemv(params_.order, params_.transA, params_.M, params_.N, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.incx, beta_, mobjC_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GEMV function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } return time; } } // namespace clMath // sgemv performance test TEST_P(GEMV, sgemv) { TestParams params; getParams(¶ms); GemvPerformanceTest::runInstance(FN_SGEMV, ¶ms); } // dgemv performance test case TEST_P(GEMV, dgemv) { TestParams params; getParams(¶ms); GemvPerformanceTest::runInstance(FN_DGEMV, ¶ms); } // cgemv performance test TEST_P(GEMV, cgemv) { TestParams params; getParams(¶ms); GemvPerformanceTest::runInstance(FN_CGEMV, ¶ms); } // zgemv performance test case TEST_P(GEMV, zgemv) { TestParams params; getParams(¶ms); GemvPerformanceTest::runInstance(FN_ZGEMV, ¶ms); } clblas-2.10/src/tests/performance/perf-ger.cpp000066400000000000000000000253551264277366700213770ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * GER performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GerPerformanceTest : public PerformanceTest { public: virtual ~GerPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GerPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if ((fn == FN_DGER || fn == FN_ZGERU) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GerPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *backA_; ElemType *x_; ElemType *y_; cl_mem mobjA_; cl_mem mobjx_; size_t lengthA; cl_mem mobjy_; ::clMath::BlasBase *base_; }; template GerPerformanceTest::GerPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( ( (3 * params->M * params->N) + params->M ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL) { if( params_.order == clblasColumnMajor ) lengthA = params_.N * params_.lda; else lengthA = params_.M * params_.lda; A_ = new ElemType[lengthA + params_.offa]; backA_ = new ElemType[lengthA + params_.offa]; x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX]; y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ; base_ = ::clMath::BlasBase::getInstance(); } template GerPerformanceTest::~GerPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(x_ != NULL) { delete[] x_; } if(y_ != NULL) { delete[] y_; } if(backA_ != NULL) { delete[] backA_; } if( mobjy_ != NULL ) clReleaseMemObject(mobjy_); if( mobjx_ != NULL ) clReleaseMemObject(mobjx_); if( mobjA_ != NULL ) clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GerPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t m = params->M, n = params->N; if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = std::max(m, n) * 
params_.lda * sizeof(ElemType) < allocSize; ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize); ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize); ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) + ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize); return ret; } template int GerPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID BlasFn = CLBLAS_GER; populate( A_+ params_.offa, params_.M, params_.N, params_.lda, BlasFn, creationFlags); populate( x_, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), 1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), BlasFn, creationFlags ); populate( y_, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), 1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), BlasFn, creationFlags ); memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE); mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE); mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE); return ( (mobjA_ != NULL) && (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1; } template nano_time_t GerPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, fN, fM; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; lda = params_.lda; fM = params_.M; fN = params_.N; #ifdef PERF_TEST_WITH_ACML clblasOrder fOrder; size_t fOffx, fOffy; int fIncx, fIncy; ElemType *fX, *fY; fOrder = params_.order; fM = params_.M; fN = params_.N; fIncx = params_.incx; fIncy = params_.incy; fX = x_; fY = y_; fOffx = params_.offBX; fOffy = params_.offCY; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fM = params_.N; fN = params_.M; fX = y_; fY = x_; fIncx = params_.incy; fIncy = params_.incx; fOffx = params_.offCY; fOffy = params_.offBX; } time = getCurrentTime(); clMath::blas::ger(order, fM, fN, alpha_, fX, fOffx, fIncx, fY, fOffy, fIncy, A_, params_.offa, lda); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GerPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::ger(params_.order, params_.M, params_.N, alpha_, mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, 
mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GER function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // ger performance test TEST_P(GER, sger) { TestParams params; getParams(¶ms); GerPerformanceTest::runInstance(FN_SGER, ¶ms); } TEST_P(GER, dger) { TestParams params; getParams(¶ms); GerPerformanceTest::runInstance(FN_DGER, ¶ms); } TEST_P(GER, cgeru) { TestParams params; getParams(¶ms); GerPerformanceTest::runInstance(FN_CGERU, ¶ms); } TEST_P(GER, zgeru) { TestParams params; getParams(¶ms); GerPerformanceTest::runInstance(FN_ZGERU, ¶ms); } clblas-2.10/src/tests/performance/perf-gerc.cpp000066400000000000000000000255111264277366700215340ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Symm performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class GercPerformanceTest : public PerformanceTest { public: virtual ~GercPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { GercPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (fn == FN_ZGERC && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: GercPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *backA_; ElemType *x_; ElemType *y_; cl_mem mobjA_; cl_mem mobjx_; cl_mem mobjy_; int lengthA; ::clMath::BlasBase *base_; }; template GercPerformanceTest::GercPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) (((2 * params->M * params->N) + params->M + params->N ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL) { //if( params_.side == clblasLeft ) // ka = params_.M; //else ka = params_.N; if( params_.order == clblasColumnMajor ) lengthA = params_.N * params_.lda; else lengthA = params_.M * params_.lda; A_ = new ElemType[(lengthA) + params_.offa]; backA_ = new ElemType[lengthA+ params_.offa]; x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX]; y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ; base_ = ::clMath::BlasBase::getInstance(); } template GercPerformanceTest::~GercPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(x_ != NULL) { delete[] x_; } if(y_ != NULL) { delete[] y_; } if(backA_ != NULL) { delete[] backA_; } if( mobjy_ != NULL ) clReleaseMemObject(mobjy_); if( mobjx_ != NULL ) clReleaseMemObject(mobjx_); if( mobjA_ != NULL ) clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool GercPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t m = params->M, n = params->N; if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize; ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize); ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize); ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) + ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize); return ret; } template int GercPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } int 
creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); BlasRoutineID funcId = CLBLAS_GER; populate( A_ + params_.offa, params_.M, params_.N, params_.lda, funcId, creationFlags); populate( x_ , (1 + (params_.M-1) * abs(params_.incx) + params_.offBX),1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), funcId, 0 ); populate( y_ , (1 + (params_.N-1) * abs(params_.incy) + params_.offCY),1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), funcId, 0 ); memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE); mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE); mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE); return ( (mobjA_ != NULL) && (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1; } template nano_time_t GercPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda; //int fIncx, fIncy; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML clblasOrder fOrder; size_t fN, fM; size_t fOffx, fOffy; int fIncx, fIncy; ElemType *fX, *fY; fOrder = params_.order; fM = params_.M; fN = params_.N; fIncx = params_.incx; fIncy = params_.incy; fX = x_; fY = y_; fOffx = params_.offBX; fOffy = params_.offCY; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 ); fM = params_.N; fN = params_.M; fX = y_; fY = x_; fIncx = params_.incy; fIncy = params_.incx; fOffx = params_.offCY; fOffy = params_.offBX; // Note this according to the Legacy guide time = getCurrentTime(); clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A_, params_.offa, params_.lda); } else{ time = getCurrentTime(); clMath::blas::gerc(order, fM, fN, alpha_, fX, fOffx, params_.incx, fY, fOffy, params_.incy, A_, params_.offa, lda); } time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t GercPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::gerc(params_.order, params_.M, params_.N, alpha_, mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GERC function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - 
time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(GERC, cgerc) { TestParams params; getParams(¶ms); GercPerformanceTest::runInstance(FN_CGERC, ¶ms); } TEST_P(GERC, zgerc) { TestParams params; getParams(¶ms); GercPerformanceTest::runInstance(FN_ZGERC, ¶ms); } clblas-2.10/src/tests/performance/perf-hbmv.cpp000066400000000000000000000226241264277366700215520ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Hbmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HbmvPerformanceTest : public PerformanceTest { public: virtual ~HbmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HbmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor = 1; BlasBase *base; base = clMath::BlasBase::getInstance(); if ((fn == FN_ZHBMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HbmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha; ElemType beta; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template HbmvPerformanceTest::HbmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)( ( (2 * (params->N) * (params->K + 1) // A-access - (2 * params->K * (params->K+1)) ) // Substract hole-part for A & X +( ((2*params->K + 1) * params->N + 2*params->N)) // X & Y access ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL) { size_t lenA, lenX, lenY; lenA = (params_.N) * (params_.lda) + params_.offA; lenX = ((params_.N) - 1)* params_.incx + 1 + params_.offBX; lenY = ((params_.N) - 1)* params_.incy + 1 + params_.offCY; A_ = new ElemType[ lenA ]; X_ = new ElemType[ lenX ]; Y_ = new ElemType[ lenY ]; backY_ = new ElemType[ lenY ]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template HbmvPerformanceTest::~HbmvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HbmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N, lda = params->lda; size_t lenA = ((n ) * lda + params->offA)* sizeof(ElemType); size_t lenX = (((params->N) - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType); size_t lenY = (((params->N) - 1)* params->incy + 1 + params->offCY) * sizeof(ElemType); if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize ); return suff; } template int HbmvPerformanceTest::prepare(void) { size_t lenX, lenY, lenA; lenA = (params_.N ) * params_.lda + params_.offA; lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX; lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY; randomGbmvMatrices(params_.order, clblasNoTrans, params_.N, params_.N, &alpha, &beta, (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy ); memcpy(backY_, Y_, lenY * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 
0 : -1; } template nano_time_t HbmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder fOrder; clblasUplo fUplo; size_t lda, lenY; size_t fN = params_.N, fK = params_.K; lenY = ((params_.N) - 1)* params_.incy + 1 + params_.offCY; memcpy(Y_, backY_, lenY * sizeof(ElemType)); fOrder = params_.order; fUplo = params_.uplo; lda = params_.lda; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower; } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::hbmv(fOrder, fUplo, fN, fK, alpha, A_, params_.offA, lda, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HbmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; size_t lenY; cl_command_queue queue = base_->commandQueues()[0]; lenY = ((params_.N) - 1)* params_.incy + 1 + params_.offCY; status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, lenY * sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { status = clMath::clblas::hbmv(params_.order, params_.uplo, params_.N, params_.K, alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GBMV function failed, status = " << status << endl; return NANOTIME_ERR; } } clFinish( queue ); time = getCurrentTime() - time; time /= iter; return time; } } // namespace clMath // chbmv performance test TEST_P(HBMV, chbmv) { TestParams params; getParams(¶ms); HbmvPerformanceTest::runInstance(FN_CHBMV, ¶ms); } // zhbmv performance test case TEST_P(HBMV, zhbmv) { TestParams params; getParams(¶ms); HbmvPerformanceTest::runInstance(FN_ZHBMV, ¶ms); } clblas-2.10/src/tests/performance/perf-hemm.cpp000066400000000000000000000245131264277366700215430ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" //#define SHUNT_ACML_RUN /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HemmPerformanceTest : public PerformanceTest { public: virtual ~HemmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HemmPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 8; if ((fn == FN_ZHEMM) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HemmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; size_t ka, kbc; ::clMath::BlasBase *base_; }; template HemmPerformanceTest::HemmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t) ( params->M * params->N * ( (params->side == clblasLeft)? 
params->M : params->N ) ) ), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { if( params_.side == clblasLeft ) ka = params_.M; else ka = params_.N; if( params_.order == clblasColumnMajor ) kbc = params_.N; else kbc = params_.M; A_ = new ElemType[params_.lda * ka + params_.offA]; B_ = new ElemType[params_.ldb * kbc + params_.offBX]; C_ = new ElemType[params_.ldc * kbc + params_.offCY]; backC_ = new ElemType[params_.ldc * kbc + params_.offCY]; base_ = ::clMath::BlasBase::getInstance(); } template HemmPerformanceTest::~HemmPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(B_ != NULL) { delete[] B_; } if(C_ != NULL) { delete[] C_; } if(backC_ != NULL) { delete[] backC_; } if( mobjC_ != NULL ) { clReleaseMemObject(mobjC_); } if( mobjB_ != NULL ) { clReleaseMemObject(mobjB_); } if( mobjA_ != NULL ) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HemmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t m = params->M, n = params->N; if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = (std::max(m, n) * params_.lda * sizeof(ElemType)) < allocSize; ret = (ret && (std::max(m, n) * params_.ldb * sizeof(ElemType)) < allocSize); ret = (ret && (std::max(m, n) * params_.ldc * sizeof(ElemType)) < allocSize); ret = (ret && (((std::max(m, n) * params_.lda) + (std::max(m, n) * params_.ldb) + (std::max(m, n) * params_.ldc))) < gmemSize); return ret; } template int HemmPerformanceTest::prepare(void) { //bool useAlpha = base_->useAlpha(); //bool useBeta = base_->useBeta(); int creationFlags = 0, AcreationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); AcreationFlags = ( (params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_HEMM; populate( A_ + params_.offA, ka, ka, params_.lda, BlasFn, AcreationFlags); populate( B_ + params_.offBX, params_.M, params_.N, params_.ldb, BlasFn, creationFlags ); populate( C_ + params_.offCY, params_.M, params_.N, params_.ldc, BlasFn, creationFlags ); memcpy( backC_, C_, (kbc * params_.ldc + params_.offCY) * sizeof(ElemType) ); mobjA_ = base_->createEnqueueBuffer(A_, (params_.lda * ka + params_.offA) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjB_ = base_->createEnqueueBuffer(B_, (params_.ldb * kbc + params_.offBX) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjC_ = base_->createEnqueueBuffer(backC_, (params_.ldc * kbc + params_.offCY) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return (mobjC_) ? 0 : -1; } template nano_time_t HemmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasSide fSide; size_t lda, ldb, ldc, fN, fM; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; fSide = params_.side; lda = params_.lda; ldb = params_.ldb; ldc = params_.ldc; fM = params_.M; fN = params_.N; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fM = params_.N; fN = params_.M; fSide = (params_.side == clblasLeft)? 
clblasRight: clblasLeft; fUplo = (params_.uplo == clblasUpper)? clblasLower: clblasUpper; } time = getCurrentTime(); #ifndef SHUNT_ACML_RUN clMath::blas::hemm(order, fSide, fUplo, fM, fN, alpha_, A_, params_.offA, lda, B_, params_.offBX, ldb, beta_, C_, params_.offCY, ldc); #endif time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HemmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (params_.ldc * kbc + params_.offCY) * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); //#define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hemm(params_.order, params_.side, params_.uplo, params_.M, params_.N, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HEMM function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HEMM, chemm) { TestParams params; getParams(¶ms); HemmPerformanceTest::runInstance(FN_CHEMM, ¶ms); } TEST_P(HEMM, zhemm) { TestParams params; getParams(¶ms); HemmPerformanceTest::runInstance(FN_ZHEMM, ¶ms); } clblas-2.10/src/tests/performance/perf-hemv.cpp000066400000000000000000000231171264277366700215530ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Hemv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HemvPerformanceTest : public PerformanceTest { public: virtual ~HemvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HemvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; //FIX-ME if ((fn == FN_ZHEMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HemvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ElemType alpha, beta; ::clMath::BlasBase *base_; }; template HemvPerformanceTest::HemvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( ((2 * (( params->N * (params->N)) + params->N)) ) * sizeof(ElemType) ) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offA]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backY_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template HemvPerformanceTest::~HemvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HemvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff 
= ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int HemvPerformanceTest::prepare(void) { size_t lenX, N, lenY; N = params_.N; lenX = 1 + (N-1) * abs(params_.incx); lenY = 1 + (N-1) * abs(params_.incy); randomHemvMatrices(params_.order, params_.uplo, N, true, &alpha, (A_ + params_.offA), params_.lda, (X_ + params_.offBX), params_.incx, true, &beta, (Y_ + params_.offCY), params_.incy); memcpy(backY_, Y_, (lenY+ params_.offCY )* sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offA)* sizeof(*A_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY )* sizeof(*Y_), 0, CL_MEM_READ_WRITE); return ( (mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL) ) ? 0 : -1; } template nano_time_t HemvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; doConjugate( (A_ + params_.offA), params_.N, params_.N, params_.lda ); } time = getCurrentTime(); clMath::blas::hemv(order, fUplo, params_.N, alpha, A_, params_.offA, lda, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HemvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; int lenY = 1 + (params_.N-1) * abs(params_.incy); status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hemv(params_.order, params_.uplo, params_.N, alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HEMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = 
NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; } } // namespace clMath TEST_P(HEMV, chemv) { TestParams params; getParams(¶ms); HemvPerformanceTest::runInstance(FN_CHEMV, ¶ms); } TEST_P(HEMV, zhemv) { TestParams params; getParams(¶ms); HemvPerformanceTest::runInstance(FN_ZHEMV, ¶ms); } clblas-2.10/src/tests/performance/perf-her.cpp000066400000000000000000000217401264277366700213720ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * HER performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HerPerformanceTest : public PerformanceTest { public: virtual ~HerPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HerPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_ZHER) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HerPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *backA_; cl_mem mobjA_; cl_mem mobjX_; ::clMath::BlasBase *base_; }; template HerPerformanceTest::HerPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))), params_(*params), mobjA_(NULL), mobjX_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backA_ = new ElemType[params_.N * params_.lda + params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template HerPerformanceTest::~HerPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(backA_ != NULL) { delete[] backA_; } if(X_ != NULL) { delete[] X_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjA_ != NULL) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HerPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (backA_ == NULL) || (X_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int HerPerformanceTest::prepare(void) { size_t lenX = 1 + (params_.N-1) * abs(params_.incx); alpha_ = convertMultiplier(params_.alpha); randomHerMatrices( params_.order, params_.uplo, params_.N, &alpha_, (A_ + params_.offa), params_.lda, (X_ + params_.offBX), params_.incx ); memcpy(backA_, A_, ((params_.N * params_.lda + params_.offa)* sizeof(ElemType))); mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa)* sizeof(*A_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjA_ != NULL) && (mobjX_ != NULL) ) ? 0 : -1; } template nano_time_t HerPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML clblasOrder fOrder; clblasUplo fUplo; fOrder = params_.order; fUplo = params_.uplo; if (order != clblasColumnMajor) { doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 ); fOrder = clblasColumnMajor; fUplo = (fUplo == clblasLower)? 
clblasUpper : clblasLower; } time = getCurrentTime(); clMath::blas::her(fOrder, fUplo, params_.N, CREAL(alpha_), X_, params_.offBX, params_.incx, A_, params_.offa, lda); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HerPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, ((params_.N * params_.lda) + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::her(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HER function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HER, cher) { TestParams params; getParams(¶ms); HerPerformanceTest::runInstance(FN_CHER, ¶ms); } TEST_P(HER, zher) { TestParams params; getParams(¶ms); HerPerformanceTest::runInstance(FN_ZHER, ¶ms); } clblas-2.10/src/tests/performance/perf-her2.cpp000066400000000000000000000235131264277366700214540ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Her2 performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Her2PerformanceTest : public PerformanceTest { public: virtual ~Her2PerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Her2PerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_ZHER2) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Her2PerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backA_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template Her2PerformanceTest::Her2PerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backA_ = new ElemType[params_.N * params_.lda + params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template Her2PerformanceTest::~Her2PerformanceTest() { if(A_ != NULL) { delete[] A_; } if(backA_ != NULL) { delete[] backA_; } if(X_ != NULL) { delete[] X_; } if(Y_ != NULL) { delete[] Y_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjY_ != NULL) { clReleaseMemObject(mobjY_); } if(mobjA_ != NULL) { clReleaseMemObject(mobjA_); } } template bool Her2PerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (backA_ == NULL) || (X_ == NULL) || (Y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize) && ((1 + (n-1)*abs(params->incy))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int Her2PerformanceTest::prepare(void) { //bool useAlpha = true; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); size_t lenY = 1 + (params_.N-1) * abs(params_.incy); alpha_ = convertMultiplier(params_.alpha); randomHer2Matrices(params_.order, params_.uplo, params_.N, &alpha_, (A_ + params_.offa), params_.lda, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy); memcpy(backA_, A_, ((params_.N * params_.lda + 
params_.offa)* sizeof(ElemType)));

    mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa) * sizeof(*A_),
                                        0, CL_MEM_READ_WRITE);
    mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX) * sizeof(*X_),
                                        0, CL_MEM_READ_ONLY);
    mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY) * sizeof(*Y_),
                                        0, CL_MEM_READ_ONLY);

    return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
}

template <typename ElemType> nano_time_t
Her2PerformanceTest<ElemType>::etalonPerfSingle(void)
{
    clblasOrder order;
    clblasUplo fUplo;
    nano_time_t time = 0;
    size_t lda;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    order = params_.order;
    lda = params_.lda;
    fUplo = params_.uplo;

#ifdef PERF_TEST_WITH_ACML
    ElemType *fX, *fY;
    int fIncx, fIncy;
    size_t fOffx, fOffy;

    fX = X_;
    fOffx = params_.offBX;
    fIncx = params_.incx;
    fY = Y_;
    fOffy = params_.offCY;
    fIncy = params_.incy;

    if (order != clblasColumnMajor) {
        doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
        doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
        order = clblasColumnMajor;
        fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
        fX = Y_;
        fOffx = params_.offCY;
        fIncx = params_.incy;
        fY = X_;
        fOffy = params_.offBX;
        fIncy = params_.incx;
    }

    time = getCurrentTime();
    clMath::blas::her2(order, fUplo, params_.N, alpha_, fX, fOffx, fIncx,
                       fY, fOffy, fIncy, A_, params_.offa, lda);
    time = getCurrentTime() - time;
#endif  // PERF_TEST_WITH_ACML

    return time;
}

template <typename ElemType> nano_time_t
Her2PerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];

    status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
                                  ((params_.N * params_.lda) + params_.offa) * sizeof(ElemType),
                                  backA_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Matrix A buffer object enqueuing error, status = " << status << endl;
        return NANOTIME_ERR;
    }
    status = clWaitForEvents(1, &event);
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " << status << endl;
        return NANOTIME_ERR;
    }
    event = NULL;

#define TIMING
#ifdef TIMING
    clFinish( queue);
    time = getCurrentTime();
    int iter = 20;
    for ( int i = 1; i <= iter; i++) {
#endif
    status = (cl_int)clMath::clblas::her2(params_.order, params_.uplo, params_.N,
                                          alpha_, mobjX_, params_.offBX, params_.incx,
                                          mobjY_, params_.offCY, params_.incy,
                                          mobjA_, params_.offa, params_.lda,
                                          1, &queue, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "The CLBLAS HER2 function failed, status = " << status << endl;
        return NANOTIME_ERR;
    }
#ifdef TIMING
    } // iter loop
    clFinish( queue);
    time = getCurrentTime() - time;
    time /= iter;
#else
    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }
    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif

    return time;
}

} // namespace clMath

TEST_P(HER2, cher2)
{
    TestParams params;

    getParams(&params);
    Her2PerformanceTest<FloatComplex>::runInstance(FN_CHER2, &params);
}

TEST_P(HER2, zher2)
{
    TestParams params;

    getParams(&params);
    Her2PerformanceTest<DoubleComplex>::runInstance(FN_ZHER2, &params);
}
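/*
 * A note on the timing pattern used by clblasPerfSingle() above: when TIMING
 * is defined, the test drains the queue with clFinish(), reads the clock,
 * enqueues `iter` back-to-back calls, drains the queue again and divides the
 * elapsed wall-clock time by `iter`.  The sketch below applies the same idea
 * to the public clBLAS entry point clblasCher2(), outside the test harness.
 * It is only an illustration under the following assumptions: clblasSetup()
 * has already been called, the queue and the device buffers bufA, bufX and
 * bufY have been created and populated by the caller, and std::chrono stands
 * in for the harness helpers getCurrentTime()/nano_time_t.
 */
#include <chrono>
#include <cstdio>
#include <clBLAS.h>

static double
averageCher2TimeNs(cl_command_queue queue, size_t N, size_t lda,
                   cl_mem bufA, cl_mem bufX, cl_mem bufY, int iter)
{
    cl_float2 alpha;
    cl_event event = NULL;

    alpha.s[0] = 1.0f;      /* real part of alpha */
    alpha.s[1] = 0.0f;      /* imaginary part of alpha */

    clFinish(queue);        /* make sure no earlier work is still in flight */
    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();

    for (int i = 0; i < iter; i++) {
        clblasStatus status = clblasCher2(clblasColumnMajor, clblasUpper, N,
                                          alpha, bufX, 0, 1, bufY, 0, 1,
                                          bufA, 0, lda,
                                          1, &queue, 0, NULL, &event);
        if (status != clblasSuccess) {
            fprintf(stderr, "clblasCher2() failed, status = %d\n", (int)status);
            return -1.0;
        }
    }

    clFinish(queue);        /* wait for all enqueued iterations to complete */
    std::chrono::steady_clock::time_point stop = std::chrono::steady_clock::now();

    /* average wall-clock time per call, in nanoseconds */
    return std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count()
           / (double)iter;
}
/*
 * Averaging over a clFinish()-bracketed batch, as these tests do, amortizes
 * the enqueue overhead of a single call and gives a steadier figure than
 * timing one kernel launch with the host clock.
 */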
clblas-2.10/src/tests/performance/perf-her2k.cpp000066400000000000000000000241051264277366700216250ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Her2kPerformanceTest : public PerformanceTest { public: virtual ~Her2kPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Her2kPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 8; if (( fn == FN_ZHER2K) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Her2kPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template Her2kPerformanceTest::Her2kPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((params->N * params->N * params->K)) ), params_(*params), mobjA_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template Her2kPerformanceTest::~Her2kPerformanceTest() { if(A_!=NULL) { delete[] A_; } if(B_!=NULL) { delete[] B_; } if(C_!=NULL) { delete[] C_; } if(backC_!=NULL) { delete[] backC_; } if(mobjC_!=NULL) { clReleaseMemObject(mobjC_); } if(mobjA_!=NULL) { clReleaseMemObject(mobjA_); } if(mobjB_!=NULL) { clReleaseMemObject(mobjB_); } } template bool Her2kPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t n = params->N, k = params->K; if((A_ == NULL) || (B_ == NULL) || (backC_ == NULL) || (C_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); maxMatrSize = gmemSize / 2; maxMatrSize = std::min(maxMatrSize, allocSize); return ((2 * n * k * sizeof(ElemType)) + (n * n * sizeof(ElemType)) < maxMatrSize); } template int Her2kPerformanceTest::prepare(void) { alpha_ = convertMultiplier(params_.alpha); beta_ = convertMultiplier(params_.beta); clblasTranspose ftransB = (params_.transA==clblasNoTrans)? clblasConjTrans: clblasNoTrans; randomGemmMatrices(params_.order, params_.transA, ftransB, params_.N, params_.N, params_.K, true, &alpha_, A_, params_.lda, B_, params_.ldb, true, &beta_, backC_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(ElemType), params_.offBX * sizeof(ElemType), CL_MEM_READ_ONLY); } if (mobjB_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 0 : -1; } template nano_time_t Her2kPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTransA; ElemType fAlpha; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; fUplo = params_.uplo; fTransA = params_.transA; fAlpha = alpha_; if (order != clblasColumnMajor) { CIMAG( fAlpha ) *= -1.0; fTransA = (params_.transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans; fUplo = (params_.uplo == clblasUpper) ? 
clblasLower : clblasUpper; } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::her2k(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K, fAlpha, A_, 0, params_.lda, B_, 0, params_.ldb, CREAL( beta_), C_, 0, params_.ldc); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Her2kPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::her2k(params_.order, params_.uplo, params_.transA, params_.N, params_.K, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, CREAL(beta_), mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HER2K function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HER2K, cher2k) { TestParams params; getParams(¶ms); Her2kPerformanceTest::runInstance(FN_CHER2K, ¶ms); } TEST_P(HER2K, zher2k) { TestParams params; getParams(¶ms); Her2kPerformanceTest::runInstance(FN_ZHER2K, ¶ms); } clblas-2.10/src/tests/performance/perf-herk.cpp000066400000000000000000000233721264277366700215500ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HerkPerformanceTest : public PerformanceTest { public: virtual ~HerkPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HerkPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 8; if (( fn == FN_ZHERK) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HerkPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template HerkPerformanceTest::HerkPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((params->N * params->N * params->K) / 2) ), params_(*params), mobjA_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template HerkPerformanceTest::~HerkPerformanceTest() { if(A_!=NULL) { delete[] A_; } if(C_!=NULL) { delete[] C_; } if(backC_!=NULL) { delete[] backC_; } if(mobjC_!=NULL) { clReleaseMemObject(mobjC_); } if(mobjA_!=NULL) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HerkPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t n = params->N, k = params->K; if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); maxMatrSize = gmemSize / 2; maxMatrSize = std::min(maxMatrSize, allocSize); return ((n * k * sizeof(ElemType)) + (n * n * sizeof(ElemType)) < maxMatrSize); // bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations //suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; 
//for total global allocations //return suff ; } template int HerkPerformanceTest::prepare(void) { alpha_ = convertMultiplier(params_.alpha); beta_ = convertMultiplier(params_.beta); randomGemmMatrices(params_.order, params_.transA, clblasNoTrans, params_.N, params_.N, params_.K, true, &alpha_, A_, params_.lda, NULL, 0, true, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 0 : -1; } template nano_time_t HerkPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTransA; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; fUplo = params_.uplo; fTransA = params_.transA; #ifdef PERF_TEST_WITH_ACML fTransA = params_.transA; fUplo = params_.uplo; if (order != clblasColumnMajor) { fTransA = (params_.transA == clblasNoTrans) ? clblasConjTrans : clblasNoTrans; fUplo = (params_.uplo == clblasUpper) ? clblasLower : clblasUpper; } time = getCurrentTime(); clMath::blas::herk(clblasColumnMajor, fUplo, fTransA, params_.N, params_.K, CREAL(alpha_), A_, params_.lda,CREAL( beta_), C_, params_.ldc); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HerkPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::herk(params_.order, params_.uplo, params_.transA, params_.N, params_.K, CREAL(alpha_), mobjA_, params_.offA, params_.lda, CREAL(beta_), mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HERK function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HERK, cherk) { TestParams params; getParams(¶ms); HerkPerformanceTest::runInstance(FN_CHERK, ¶ms); } TEST_P(HERK, zherk) { TestParams params; getParams(¶ms); HerkPerformanceTest::runInstance(FN_ZHERK, ¶ms); } 
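/*
 * [Editorial illustration, not part of the original clBLAS sources.]
 * Before running, each test calls areResourcesSufficient(): it sums the
 * buffer sizes the case will allocate and skips the case when they exceed
 * the device limits reported by BlasBase (availGlobalMemSize() and
 * maxMemAllocSize(), presumably backed by CL_DEVICE_GLOBAL_MEM_SIZE and
 * CL_DEVICE_MAX_MEM_ALLOC_SIZE). The free function below restates the HERK
 * variant of that check, an n x k input matrix plus an n x n result matrix
 * measured against a budget of min(globalMem / 2, maxAlloc); the function
 * name and the sample numbers in main() are illustrative assumptions.
 */
#include <algorithm>
#include <cstddef>
#include <cstdio>

/* True when the combined footprint of A (n x k) and C (n x n) fits both the
 * per-allocation limit and half of the device's global memory, which is the
 * budget the performance tests allow themselves. */
static bool herkResourcesSufficient(std::size_t n, std::size_t k,
                                    std::size_t elemSize,
                                    std::size_t globalMemSize,
                                    std::size_t maxAllocSize)
{
    std::size_t budget = std::min(globalMemSize / 2, maxAllocSize);
    std::size_t sizeA = n * k * elemSize;   /* input matrix A  */
    std::size_t sizeC = n * n * elemSize;   /* result matrix C */
    return (sizeA + sizeC) < budget;
}

int main()
{
    /* e.g. N = K = 4096 double-complex elements (16 bytes each) on a device
     * with 1 GiB of global memory and a 256 MiB per-allocation limit;
     * the numbers are made up for the example. */
    bool ok = herkResourcesSufficient(4096, 4096, 16,
                                      1024ull << 20, 256ull << 20);
    std::printf("HERK case %s\n", ok ? "fits" : "would be skipped");
    return 0;
}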
clblas-2.10/src/tests/performance/perf-hpmv.cpp000066400000000000000000000232461264277366700215710ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Hpmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HpmvPerformanceTest : public PerformanceTest { public: virtual ~HpmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HpmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; //FIX-ME if ((fn == FN_ZHPMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HpmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *AP_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjAP_; cl_mem mobjX_; cl_mem mobjY_; ElemType alpha, beta; ::clMath::BlasBase *base_; }; template HpmvPerformanceTest::HpmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( ((2 * (( params->N * (params->N)) + params->N)) ) * sizeof(ElemType) ) ) ), params_(*params), mobjAP_(NULL), mobjX_(NULL) { AP_ = new ElemType[((params_.N * (params_.N + 1)) / 2 ) + params_.offA]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backY_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjAP_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template HpmvPerformanceTest::~HpmvPerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjAP_ != NULL ) clReleaseMemObject(mobjAP_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HpmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*((n*(n+1))/2) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( ((n*(n+1))/2) + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int HpmvPerformanceTest::prepare(void) { size_t lenX, N, lenY; N = params_.N; lenX = 1 + (N-1) * abs(params_.incx); lenY = 1 + (N-1) * abs(params_.incy); randomHemvMatrices(params_.order, params_.uplo, N, true, &alpha, (AP_ + params_.offA), params_.lda, (X_ + params_.offBX), params_.incx, true, &beta, (Y_ + params_.offCY), params_.incy); memcpy(backY_, Y_, (lenY+ params_.offCY )* sizeof(ElemType)); mobjAP_ = base_->createEnqueueBuffer(AP_, (((params_.N * (params_.N + 1)) / 2 ) + params_.offA)* sizeof(*AP_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY )* sizeof(*Y_), 0, CL_MEM_READ_WRITE); return ( (mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL) ) ? 0 : -1; } template nano_time_t HpmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? 
clblasLower : clblasUpper; doConjugate( (AP_ + params_.offA), params_.N, params_.N, params_.lda ); doConjugate( (AP_ + params_.offA), ((params_.N * (params_.N + 1)) / 2 ), 1, 1 ); } time = getCurrentTime(); clMath::blas::hpmv(order, fUplo, params_.N, alpha, AP_, params_.offA, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HpmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; int lenY = 1 + (params_.N-1) * abs(params_.incy); status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hpmv(params_.order, params_.uplo, params_.N, alpha, mobjAP_, params_.offA, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HPMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; } } // namespace clMath TEST_P(HPMV, chpmv) { TestParams params; getParams(¶ms); HpmvPerformanceTest::runInstance(FN_CHPMV, ¶ms); } TEST_P(HPMV, zhpmv) { TestParams params; getParams(¶ms); HpmvPerformanceTest::runInstance(FN_ZHPMV, ¶ms); } clblas-2.10/src/tests/performance/perf-hpr.cpp000066400000000000000000000217571264277366700214150ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class HprPerformanceTest : public PerformanceTest { public: virtual ~HprPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { HprPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_ZHPR) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: HprPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *AP_; ElemType *X_; ElemType *backAP_; cl_mem mobjAP_; cl_mem mobjX_; ::clMath::BlasBase *base_; }; template HprPerformanceTest::HprPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))), params_(*params), mobjAP_(NULL), mobjX_(NULL) { AP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backAP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template HprPerformanceTest::~HprPerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(backAP_ != NULL) { delete[] backAP_; } if(X_ != NULL) { delete[] X_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjAP_ != NULL) { clReleaseMemObject(mobjAP_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool HprPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType) *(( n*( n + 1 ) )/2 )< allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && (((( (n*( n + 1 ) )/2 ) + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int HprPerformanceTest::prepare(void) { size_t lenX = 1 + (params_.N-1) * 
abs(params_.incx); alpha_ = convertMultiplier(params_.alpha); randomHerMatrices( params_.order, params_.uplo, params_.N, &alpha_, (AP_ + params_.offa), 0, (X_ + params_.offBX), params_.incx ); memcpy(backAP_, AP_, ((( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(ElemType))); mobjAP_ = base_->createEnqueueBuffer(AP_, (( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjAP_ != NULL) && (mobjX_ != NULL) ) ? 0 : -1; } template nano_time_t HprPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; // size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; #ifdef PERF_TEST_WITH_ACML clblasOrder fOrder; clblasUplo fUplo; fOrder = params_.order; fUplo = params_.uplo; if (order != clblasColumnMajor) { doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 ); fOrder = clblasColumnMajor; fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; } time = getCurrentTime(); clMath::blas::hpr(fOrder, fUplo, params_.N, CREAL(alpha_), X_, params_.offBX, params_.incx, AP_, params_.offa); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t HprPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0, ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) * sizeof(ElemType), backAP_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hpr(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HPR function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HPR, chpr) { TestParams params; getParams(¶ms); HprPerformanceTest::runInstance(FN_CHPR, ¶ms); } TEST_P(HPR, zhpr) { TestParams params; getParams(¶ms); HprPerformanceTest::runInstance(FN_ZHPR, ¶ms); } clblas-2.10/src/tests/performance/perf-hpr2.cpp000066400000000000000000000237521264277366700214740ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Hpr2 performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Hpr2PerformanceTest : public PerformanceTest { public: virtual ~Hpr2PerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Hpr2PerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_ZHPR2) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Hpr2PerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *AP_; ElemType *X_; ElemType *Y_; ElemType *backAP_; cl_mem mobjAP_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template Hpr2PerformanceTest::Hpr2PerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))), params_(*params), mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL) { AP_ = new ElemType[((params_.N * (params_.N + 1))/2) + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backAP_ = new ElemType[((params_.N * (params_.N + 1))/2) + params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template Hpr2PerformanceTest::~Hpr2PerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(backAP_ != NULL) { delete[] backAP_; } if(X_ != NULL) { delete[] X_; } if(Y_ != NULL) { delete[] Y_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjY_ != NULL) { clReleaseMemObject(mobjY_); } if(mobjAP_ != NULL) { clReleaseMemObject(mobjAP_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool Hpr2PerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL) || (Y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*((params_.N * (params_.N + 1))/2) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize) && ((1 + (n-1)*abs(params->incy))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( ((params_.N * (params_.N + 1))/2) + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int Hpr2PerformanceTest::prepare(void) { //bool useAlpha = true; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); size_t lenY = 1 + (params_.N-1) * abs(params_.incy); alpha_ = convertMultiplier(params_.alpha); randomHer2Matrices(params_.order, params_.uplo, params_.N, &alpha_, (AP_ + params_.offa), params_.lda, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy); memcpy(backAP_, AP_, ((((params_.N * (params_.N + 1))/2) + params_.offa)* sizeof(ElemType))); mobjAP_ = base_->createEnqueueBuffer(AP_, (((params_.N * (params_.N + 1))/2) + params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX ) * sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY ) * sizeof(*Y_), 0, CL_MEM_READ_ONLY); return ((mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 
0 : -1; } template nano_time_t Hpr2PerformanceTest::etalonPerfSingle(void) { clblasOrder order; clblasUplo fUplo; nano_time_t time = 0; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML ElemType *fX, *fY; int fIncx, fIncy; size_t fOffx, fOffy; fX = X_; fOffx = params_.offBX; fIncx = params_.incx; fY = Y_; fOffy = params_.offCY; fIncy = params_.incy; if (order != clblasColumnMajor) { doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 ); doConjugate( (Y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 ); order = clblasColumnMajor; fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower; fX = Y_; fOffx = params_.offCY; fIncx = params_.incy; fY = X_; fOffy = params_.offBX; fIncy = params_.incx; } time = getCurrentTime(); clMath::blas::hpr2(order, fUplo, params_.N, alpha_, fX, fOffx, fIncx, fY, fOffy, fIncy, AP_, params_.offa); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Hpr2PerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0, (((params_.N * (params_.N + 1))/2) + params_.offa) * sizeof(ElemType), backAP_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::hpr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS HPR2 function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(HPR2, chpr2) { TestParams params; getParams(¶ms); Hpr2PerformanceTest::runInstance(FN_CHER2, ¶ms); } TEST_P(HPR2, zhpr2) { TestParams params; getParams(¶ms); Hpr2PerformanceTest::runInstance(FN_ZHPR2, ¶ms); } clblas-2.10/src/tests/performance/perf-iamax.cpp000066400000000000000000000205011264277366700217050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class iAmaxPerformanceTest : public PerformanceTest { public: virtual ~iAmaxPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { iAmaxPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_iDAMAX) || (fn == FN_iZAMAX)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: iAmaxPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *blasX_; cl_mem mobjX_; cl_mem mobjiAMAX_; cl_mem scratchBuff; size_t lengthX; ::clMath::BlasBase *base_; }; template iAmaxPerformanceTest::iAmaxPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL),mobjiAMAX_(NULL) { blasX_ = NULL; mobjX_= mobjiAMAX_= scratchBuff = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); try { blasX_ = new ElemType[lengthX + params_.offBX]; } catch(bad_alloc& ba) { blasX_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjiAMAX_= scratchBuff = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template iAmaxPerformanceTest::~iAmaxPerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjiAMAX_ != NULL ) { clReleaseMemObject(mobjiAMAX_); } if( scratchBuff!= NULL ) { clReleaseMemObject(scratchBuff); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool iAmaxPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeiAMAX, sizeScratchBuff; if(blasX_ == NULL) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeiAMAX = (1 + params->offa) * sizeof(ElemType); sizeScratchBuff = (params->N * 2) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeiAMAX < allocSize) && (sizeScratchBuff < allocSize)); ret = (ret && ((sizeX + sizeiAMAX + sizeScratchBuff) < gmemSize)); return ret; } template int iAmaxPerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjiAMAX_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); scratchBuff = base_->createEnqueueBuffer(NULL, ((params_.N * 2) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjiAMAX_ != NULL)&& (scratchBuff != NULL) )? 0 : -1; } template nano_time_t iAmaxPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::iamax(params_.N, blasX_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t iAmaxPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::iamax( type, params_.N, mobjiAMAX_, params_.offa, mobjX_, params_.offBX, params_.incx, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS iAMAX function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(iAMAX, isamax) { TestParams params; getParams(¶ms); iAmaxPerformanceTest::runInstance(FN_iSAMAX, ¶ms); } TEST_P(iAMAX, idamax) { TestParams params; getParams(¶ms); iAmaxPerformanceTest::runInstance(FN_iDAMAX, ¶ms); } TEST_P(iAMAX, icamax) { TestParams params; getParams(¶ms); iAmaxPerformanceTest::runInstance(FN_iCAMAX, ¶ms); } TEST_P(iAMAX, izamax) { TestParams params; getParams(¶ms); iAmaxPerformanceTest::runInstance(FN_iZAMAX, ¶ms); } clblas-2.10/src/tests/performance/perf-nrm2.cpp000066400000000000000000000203311264277366700214650ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Nrm2PerformanceTest : public PerformanceTest { public: virtual ~Nrm2PerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Nrm2PerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DNRM2) || (fn == FN_DZNRM2)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Nrm2PerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *blasX_; cl_mem mobjX_; cl_mem mobjNRM2_; cl_mem scratchBuff; size_t lengthX; ::clMath::BlasBase *base_; }; template Nrm2PerformanceTest::Nrm2PerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL),mobjNRM2_(NULL) { blasX_ = NULL; mobjX_= mobjNRM2_= scratchBuff = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); try { blasX_ = new ElemType[lengthX + params_.offBX]; } catch(bad_alloc& ba) { blasX_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjNRM2_= scratchBuff = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template Nrm2PerformanceTest::~Nrm2PerformanceTest() { if(blasX_ != NULL) { delete[] blasX_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjNRM2_ != NULL ) { clReleaseMemObject(mobjNRM2_); } if( scratchBuff!= NULL ) { clReleaseMemObject(scratchBuff); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool Nrm2PerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeNRM2, sizeScratch; if(blasX_ == NULL) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeScratch = (lengthX * 2) * sizeof(ElemType); sizeNRM2 = (1 + params->offa) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeNRM2 < allocSize) && (sizeScratch < allocSize)); ret = (ret && ((sizeX + sizeNRM2 + sizeScratch) < gmemSize)); return ret; } template int Nrm2PerformanceTest::prepare(void) { randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (ElemType*)NULL, 0, true); mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjNRM2_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); scratchBuff = base_->createEnqueueBuffer(NULL, ((2 * lengthX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjNRM2_ != NULL)&& (scratchBuff != NULL) )? 
0 : -1; } template nano_time_t Nrm2PerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::nrm2(params_.N, blasX_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Nrm2PerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; clFinish( queue); time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::nrm2( type, params_.N, mobjNRM2_, params_.offa, mobjX_, params_.offBX, params_.incx, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS NRM2 function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(NRM2, snrm2) { TestParams params; getParams(¶ms); Nrm2PerformanceTest::runInstance(FN_SNRM2, ¶ms); } TEST_P(NRM2, dnrm2) { TestParams params; getParams(¶ms); Nrm2PerformanceTest::runInstance(FN_DNRM2, ¶ms); } TEST_P(NRM2, scnrm2) { TestParams params; getParams(¶ms); Nrm2PerformanceTest::runInstance(FN_SCNRM2, ¶ms); } TEST_P(NRM2, dznrm2) { TestParams params; getParams(¶ms); Nrm2PerformanceTest::runInstance(FN_DZNRM2, ¶ms); } clblas-2.10/src/tests/performance/perf-rot.cpp000066400000000000000000000231311264277366700214140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * ROT performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double template class RotPerformanceTest : public PerformanceTest { public: virtual ~RotPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { RotPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DROT) || (fn == FN_ZDROT)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: RotPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *X_, *Y_, *back_X_, *back_Y_, alpha, beta; size_t lengthx, lengthy; cl_mem mobjX_, mobjY_; ::clMath::BlasBase *base_; }; template RotPerformanceTest::RotPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 * params->N ) * sizeof(ElemType))), params_(*params) { X_ = Y_ = NULL; back_X_ = back_Y_ = NULL; mobjX_= mobjY_ = NULL; lengthx = 1 + (params_.N - 1) * abs(params_.incx); lengthy = 1 + (params_.N - 1) * abs(params_.incy); try { X_ = new ElemType[lengthx + params_.offa]; back_X_ = new ElemType[lengthx + params_.offa]; Y_ = new ElemType[lengthy + params_.offb]; back_Y_ = new ElemType[lengthy + params_.offb]; } catch(bad_alloc& ba) { X_ = back_X_ = Y_ = back_Y_ = NULL; // areResourcesSufficient() will handle the rest and return ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template RotPerformanceTest::~RotPerformanceTest() { if(X_ != NULL) { delete[] X_; } if(back_X_ != NULL) { delete[] back_X_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if(Y_ != NULL) { delete[] Y_; } if(back_Y_ != NULL) { delete[] back_Y_; } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool RotPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t offx = params->offa; size_t offy = params->offb; size_t sizex = (lengthx + offx)*sizeof(ElemType); size_t sizey = (lengthy + offy)*sizeof(ElemType); bool ret; size_t sizeRequired = (sizex + sizey); if((X_ == NULL) || (back_X_ == NULL) || (Y_ == NULL) || (back_Y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); ret = (sizex < allocSize) && (sizey < allocSize); ret = ret && (sizeRequired < gmemSize); return ret; } template int RotPerformanceTest::prepare(void) { randomVectors(params_.N, (X_ + params_.offa), params_.incx, (Y_ + params_.offb), params_.incy); alpha= convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); memcpy(back_X_, X_, (lengthx + 
params_.offa)*sizeof(ElemType)); memcpy(back_Y_, Y_, (lengthy + params_.offb)*sizeof(ElemType)); // Allocate buffers mobjX_ = base_->createEnqueueBuffer(X_, (lengthx + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(Y_, (lengthy + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); if((mobjX_ == NULL) || (mobjY_ == NULL)) { return -1; } return 0; } template nano_time_t RotPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::rot(params_.N, back_X_, params_.offa, params_.incx, back_Y_, params_.offb, params_.incy, alpha, beta); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t RotPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; //DataType type; //type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT: TYPE_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthx + params_.offa) * sizeof(ElemType), X_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthy + params_.offb) * sizeof(ElemType), Y_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::rot(params_.N, mobjX_, params_.offa, params_.incx, mobjY_, params_.offb, params_.incy, alpha, beta, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS ROT function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // rot performance test TEST_P(ROT, srot) { TestParams params; getParams(¶ms); RotPerformanceTest::runInstance(FN_SROT, ¶ms); } TEST_P(ROT, drot) { TestParams params; getParams(¶ms); RotPerformanceTest::runInstance(FN_DROT, ¶ms); } TEST_P(ROT, csrot) { TestParams params; getParams(¶ms); RotPerformanceTest::runInstance(FN_CSROT, ¶ms); } TEST_P(ROT, zdrot) { TestParams params; getParams(¶ms); RotPerformanceTest::runInstance(FN_ZDROT, ¶ms); } clblas-2.10/src/tests/performance/perf-rotg.cpp000066400000000000000000000275571264277366700216030ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * ROTG performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double template class RotgPerformanceTest : public PerformanceTest { public: virtual ~RotgPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { RotgPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DROTG) || (fn == FN_ZROTG)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: RotgPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType1 *SA_, *SB_, *S_, *back_SA_, *back_SB_, *back_S_; ElemType2 *C_, *back_C_; cl_mem mobjSA_, mobjSB_, mobjC_, mobjS_; ::clMath::BlasBase *base_; }; template RotgPerformanceTest::RotgPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) (5 * sizeof(ElemType1) + sizeof(ElemType2))), params_(*params) { SA_ = SB_ = S_ = NULL; back_SA_ = back_SB_ = back_S_ = NULL; C_ = back_C_ = NULL; mobjSA_= mobjSB_ = mobjC_ = mobjS_ = NULL; try { SA_ = new ElemType1[1 + params_.offBX]; back_SA_ = new ElemType1[1 + params_.offBX]; SB_ = new ElemType1[1 + params_.offCY]; back_SB_ = new ElemType1[1 + params_.offCY]; C_ = new ElemType2[1 + params_.offa]; back_C_ = new ElemType2[1 + params_.offa]; S_ = new ElemType1[1 + params_.offb]; back_S_ = new ElemType1[1 + params_.offb]; } catch(bad_alloc& ba) { SA_ = back_SA_ = SB_ = back_SB_ = NULL; // areResourcesSufficient() will handle the rest and return S_ = back_S_ = NULL; C_ = back_C_ = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template RotgPerformanceTest::~RotgPerformanceTest() { if(SA_ != NULL) { delete[] SA_; } if(back_SA_ != NULL) { delete[] back_SA_; } if( mobjSA_ != NULL ) { clReleaseMemObject(mobjSA_); } if(SB_ != NULL) { delete[] SB_; } if(back_SB_ != NULL) { delete[] back_SB_; } if( mobjSB_ != NULL ) { clReleaseMemObject(mobjSB_); } if(C_ != NULL) { delete[] C_; } if(back_C_ != NULL) { delete[] back_C_; } if( mobjC_ != NULL ) { clReleaseMemObject(mobjC_); } if(S_ != NULL) { delete[] S_; } if(back_S_ != NULL) { delete[] back_S_; } if( mobjS_ != NULL ) { clReleaseMemObject(mobjS_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool RotgPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t offSA_ = params->offBX; size_t offSB_ = params->offCY; size_t offC_ = params->offa; size_t offS_ = params->offb; bool ret; size_t sizeRequired = ((1 + offSA_) + (1 + offSB_) + (1 + offS_)) * sizeof(ElemType1) + ((1 + offC_) * sizeof(ElemType2)); if((SA_ == NULL) || (back_SA_ == NULL) || (SB_ == NULL) || (back_SB_ == NULL) || (C_ == NULL) || (back_C_ == NULL) || (S_ == NULL) || (back_S_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = (sizeRequired) < allocSize; ret = ret && (sizeRequired < gmemSize); return ret; } template int RotgPerformanceTest::prepare(void) { randomVectors(1, (SA_ + params_.offBX), 1, (SB_ + params_.offCY), 1); C_[params_.offa] = back_C_[params_.offa] = ZERO(); S_[params_.offb] = back_S_[params_.offb] = ZERO(); back_SA_[params_.offBX] = SA_[params_.offBX]; back_SB_[params_.offCY] = SB_[params_.offCY]; //printing the inputs, as they change after processing ::std::cerr << "A = "; printElement(SA_[params_.offBX]); ::std::cerr << "\tB = "; printElement(SB_[params_.offCY]); ::std::cerr << "\tC = "; printElement(C_[params_.offa]); ::std::cerr << "\tS = "; printElement(S_[params_.offb]); ::std::cout << std::endl << std::endl; // Allocate buffers mobjSA_ = base_->createEnqueueBuffer(SA_, (1 + params_.offBX) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE); mobjSB_ = base_->createEnqueueBuffer(SB_, (1 + params_.offCY) * sizeof(ElemType1), 0, 
CL_MEM_READ_WRITE); mobjC_ = base_->createEnqueueBuffer(C_, (1 + params_.offa ) * sizeof(ElemType2), 0, CL_MEM_WRITE_ONLY); mobjS_ = base_->createEnqueueBuffer(S_, (1 + params_.offb ) * sizeof(ElemType1), 0, CL_MEM_WRITE_ONLY); if((mobjSA_ == NULL) || (mobjSB_ == NULL) || (mobjC_ == NULL) || (mobjS_ == NULL)) { return -1; } return 0; } template nano_time_t RotgPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::rotg(back_SA_, params_.offBX, back_SB_, params_.offCY, back_C_, params_.offa, back_S_, params_.offb); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t RotgPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType1) == typeid(float))? TYPE_FLOAT:( typeid(ElemType1) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType1) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjSA_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType1), SA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector SA buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjSB_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType1), SB_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector SB buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType2), C_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjS_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType1), S_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector S buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::rotg(type, mobjSA_, params_.offBX, mobjSB_, params_.offCY, mobjC_, params_.offa, mobjS_, params_.offb, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS ROTG function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // rotg performance test TEST_P(ROTG, srotg) { TestParams params; getParams(¶ms); RotgPerformanceTest::runInstance(FN_SROTG, ¶ms); } TEST_P(ROTG, drotg) { TestParams params; getParams(¶ms); RotgPerformanceTest::runInstance(FN_DROTG, ¶ms); } TEST_P(ROTG, crotg) { TestParams params; getParams(¶ms); RotgPerformanceTest::runInstance(FN_CROTG, ¶ms); } TEST_P(ROTG, zrotg) { 
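    // zrotg: double-complex Givens-rotation setup. RotgPerformanceTest::runInstance() (defined
    // above) skips this case when the device reports no native double-precision support, checks
    // that the four single-element buffers fit within device memory limits, and otherwise times
    // the clBLAS ROTG wrapper over 50 launches bracketed by clFinish(), comparing against the
    // ACML reference path when PERF_TEST_WITH_ACML is defined.
    // To run only this case, a gtest filter such as --gtest_filter=*ROTG.zrotg* can be passed to
    // the performance test executable (the exact binary name depends on the build configuration).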
TestParams params; getParams(¶ms); RotgPerformanceTest::runInstance(FN_ZROTG, ¶ms); } clblas-2.10/src/tests/performance/perf-rotm.cpp000066400000000000000000000251001264277366700215670ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * ROTM performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double template class RotmPerformanceTest : public PerformanceTest { public: virtual ~RotmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { RotmPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DROTM)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: RotmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *X_, *Y_, *PARAM_, *back_X_, *back_Y_, *back_PARAM_; size_t lengthx, lengthy; cl_mem mobjX_, mobjY_, mobjParam_; ::clMath::BlasBase *base_; }; template RotmPerformanceTest::RotmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 * params->N + 5) * sizeof(ElemType))), params_(*params) { X_ = Y_ = PARAM_ = NULL; back_X_ = back_Y_ = back_PARAM_ = NULL; mobjX_= mobjY_ = mobjParam_ = NULL; lengthx = 1 + (params_.N - 1) * abs(params_.incx); lengthy = 1 + (params_.N - 1) * abs(params_.incy); try { X_ = new ElemType[lengthx + params_.offa]; back_X_ = new ElemType[lengthx + params_.offa]; Y_ = new ElemType[lengthy + params_.offb]; back_Y_ = new ElemType[lengthy + params_.offb]; PARAM_ = new ElemType[5 + params_.offc]; back_PARAM_ = new ElemType[5 + params_.offc]; } catch(bad_alloc& ba) { X_ = back_X_ = Y_ = back_Y_ = NULL; // areResourcesSufficient() will handle the rest and return PARAM_ = back_PARAM_ = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template RotmPerformanceTest::~RotmPerformanceTest() { if(X_ != NULL) { delete[] X_; } if(back_X_ != NULL) { delete[] back_X_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if(Y_ != NULL) { delete[] Y_; } if(back_Y_ != NULL) { delete[] back_Y_; } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } if(PARAM_ != NULL) { delete[] PARAM_; } if(back_PARAM_ != NULL) { delete[] back_PARAM_; } if( mobjParam_ != NULL ) { clReleaseMemObject(mobjParam_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool RotmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t offx = params->offa; size_t offy = params->offb; size_t offParam = params->offc; size_t sizex = (lengthx + offx)*sizeof(ElemType); size_t sizey = (lengthy + offy)*sizeof(ElemType); size_t sizeParam = (5 + offParam)*sizeof(ElemType); bool ret; size_t sizeRequired = (sizex + sizey + sizeParam); if((X_ == NULL) || (back_X_ == NULL) || (Y_ == NULL) || (back_Y_ == NULL) || (PARAM_ == NULL) || (back_PARAM_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); ret = (sizex < allocSize) && (sizey < allocSize); ret = ret && (sizeRequired < gmemSize); return ret; } template int RotmPerformanceTest::prepare(void) { //Filling random values for SA and SB. 
C & S are only for output sake randomVectors(params_.N, (X_ + params_.offa), params_.incx, (Y_ + params_.offb), params_.incy); randomVectors(4, (PARAM_ + params_.offc + 1), 1); //1st element is initialized separately ElemType sflagParam = convertMultiplier(params_.alpha); PARAM_[params_.offc] = sflagParam; // initializing first element memcpy(back_X_, X_, (lengthx + params_.offa)*sizeof(ElemType)); memcpy(back_Y_, Y_, (lengthy + params_.offb)*sizeof(ElemType)); memcpy(back_PARAM_, PARAM_, (params_.offc)*sizeof(ElemType)); // Allocate buffers mobjX_ = base_->createEnqueueBuffer(X_, (lengthx + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(Y_, (lengthy + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjParam_ = base_->createEnqueueBuffer(PARAM_, (5 + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); if((mobjX_ == NULL) || (mobjY_ == NULL) || (mobjParam_ == NULL)) { return -1; } return 0; } template nano_time_t RotmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::rotm(params_.N, back_X_, params_.offa, params_.incx, back_Y_, params_.offb, params_.incy, back_PARAM_, params_.offc); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t RotmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT: TYPE_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthx + params_.offa) * sizeof(ElemType), X_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthy + params_.offb) * sizeof(ElemType), Y_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjParam_, CL_TRUE, 0, (5 + params_.offc) * sizeof(ElemType), PARAM_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::rotm(type, params_.N, mobjX_, params_.offa, params_.incx, mobjY_, params_.offb, params_.incy, mobjParam_, params_.offc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS ROTM function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // rotm performance test TEST_P(ROTM, srotm) { TestParams params; 
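    // srotm: single-precision modified Givens rotation. getParams() below fills 'params' from the
    // gtest value generator; prepare() then uses params.alpha (via convertMultiplier) as the rotm
    // flag stored in the first element of PARAM_, and the reported problem size counts
    // (4*N + 5) elements of buffer traffic per call. Timing follows the same pattern as the other
    // cases in this file: blocking writes of X, Y and PARAM, then 50 clBLAS ROTM launches
    // averaged between clFinish() calls.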
getParams(¶ms); RotmPerformanceTest::runInstance(FN_SROTM, ¶ms); } TEST_P(ROTM, drotm) { TestParams params; getParams(¶ms); RotmPerformanceTest::runInstance(FN_DROTM, ¶ms); } clblas-2.10/src/tests/performance/perf-rotmg.cpp000066400000000000000000000302251264277366700217420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * ROTMG performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double template class RotmgPerformanceTest : public PerformanceTest { public: virtual ~RotmgPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { RotmgPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DROTMG)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: RotmgPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *D1_, *D2_, *X_, *Y_, *PARAM_, *back_D1_, *back_D2_, *back_X_, *back_Y_, *back_PARAM_; cl_mem mobjD1_, mobjD2_, mobjX_, mobjY_, mobjParam_; ::clMath::BlasBase *base_; }; template RotmgPerformanceTest::RotmgPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ((4 + 3 + 10) * sizeof(ElemType))), params_(*params) // D1,D2,X and Param are Read/Write and Y is Read only { D1_ = D2_ = X_ = Y_ = PARAM_ = NULL; back_D1_ = back_D2_ = back_X_ = back_Y_ = back_PARAM_ = NULL; mobjD1_ = mobjD2_ = mobjX_= mobjY_ = mobjParam_ = NULL; try { D1_ = new ElemType[1 + params_.offa]; back_D1_ = new ElemType[1 + params_.offa]; D2_ = new ElemType[1 + params_.offb]; back_D2_ = new ElemType[1 + params_.offb]; X_ = new ElemType[1 + params_.offBX]; back_X_ = new ElemType[1 + params_.offBX]; Y_ = new ElemType[1 + params_.offCY]; back_Y_ = new ElemType[1 + params_.offCY]; PARAM_ = new ElemType[5 + params_.offc]; back_PARAM_ = new ElemType[5 + params_.offc]; } catch(bad_alloc& ba) { D1_ = back_D1_ = D2_ = back_D2_ = X_ = back_X_ = Y_ = back_Y_ = NULL; // areResourcesSufficient() will handle the rest and return PARAM_ = back_PARAM_ = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template RotmgPerformanceTest::~RotmgPerformanceTest() { if(D1_ != NULL) { delete[] D1_; } if(back_D1_ != NULL) { delete[] back_D1_; } if( mobjD1_ != NULL ) { clReleaseMemObject(mobjD1_); } if(D2_ != NULL) { delete[] D2_; } if(back_D2_ != NULL) { delete[] back_D2_; } if( mobjD2_ != NULL ) { clReleaseMemObject(mobjD2_); } if(X_ != NULL) { delete[] X_; } if(back_X_ != NULL) { delete[] back_X_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if(Y_ != NULL) { delete[] Y_; } if(back_Y_ != NULL) { delete[] back_Y_; } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } if(PARAM_ != NULL) { delete[] PARAM_; } if(back_PARAM_ != NULL) { delete[] back_PARAM_; } if( mobjParam_ != NULL ) { clReleaseMemObject(mobjParam_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool RotmgPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t offx = params->offBX; size_t offy = params->offCY; size_t offD1 = params->offa; size_t offD2 = params->offb; size_t offParam = params->offc; bool ret; size_t sizeRequired = ((1 + offx) + (1 + offy) + (1 + offD1) + (1 + offD2) + (1 + offParam)) * sizeof(ElemType); if((D1_ == NULL) || (back_D1_ == NULL) ||(X_ == NULL) || (back_X_ == NULL) || (Y_ == NULL) || (back_Y_ == NULL) || (D2_ == NULL) || (back_D2_ == NULL) || (PARAM_ == NULL) || (back_PARAM_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); ret = (sizeRequired) < allocSize; ret = ret && (sizeRequired < gmemSize); return ret; } template int RotmgPerformanceTest::prepare(void) { //Filling random values for SA and SB. 
C & S are only for output sake randomRotmg( (D1_ + params_.offa), (D2_ + params_.offb), (X_ + params_.offBX), (Y_ + params_.offCY), (PARAM_ + params_.offc) ); ElemType sflagParam = convertMultiplier(params_.alpha); PARAM_[params_.offc] = sflagParam; // initializing first element memcpy(back_D1_, D1_, (1 + params_.offa)*sizeof(ElemType)); memcpy(back_D2_, D2_, (1 + params_.offb)*sizeof(ElemType)); memcpy(back_X_, X_, (1 + params_.offBX)*sizeof(ElemType)); memcpy(back_Y_, Y_, (1 + params_.offCY)*sizeof(ElemType)); memcpy(back_PARAM_, PARAM_, (5 + params_.offc)*sizeof(ElemType)); // Allocate buffers mobjX_ = base_->createEnqueueBuffer(X_, (1 + params_.offBX) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(Y_, (1 + params_.offCY) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjD1_ = base_->createEnqueueBuffer(D1_, (1 + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjD2_ = base_->createEnqueueBuffer(D2_, (1 + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjParam_ = base_->createEnqueueBuffer(PARAM_, (5 + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); if((mobjD1_ == NULL) || (mobjD2_ == NULL) || (mobjX_ == NULL) || (mobjY_ == NULL) || (mobjParam_ == NULL)) { return -1; } return 0; } template nano_time_t RotmgPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::rotmg(back_D1_, params_.offa, back_D2_, params_.offb, back_X_, params_.offBX, back_Y_, params_.offCY, back_PARAM_, params_.offc); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t RotmgPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? 
TYPE_FLOAT: TYPE_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjD1_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType), D1_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector D1 buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjD2_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType), D2_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector D2 buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType), X_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjParam_, CL_TRUE, 0, (5 + params_.offc) * sizeof(ElemType), PARAM_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::rotmg(type, mobjD1_, params_.offa, mobjD2_, params_.offb, mobjX_, params_.offBX, mobjY_, params_.offCY, mobjParam_, params_.offc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS ROTMG function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // rotmg performance test TEST_P(ROTMG, srotmg) { TestParams params; getParams(¶ms); RotmgPerformanceTest::runInstance(FN_SROTMG, ¶ms); } TEST_P(ROTMG, drotmg) { TestParams params; getParams(¶ms); RotmgPerformanceTest::runInstance(FN_DROTMG, ¶ms); } clblas-2.10/src/tests/performance/perf-sbmv.cpp000066400000000000000000000231241264277366700215610ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Sbmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SbmvPerformanceTest : public PerformanceTest { public: virtual ~SbmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SbmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor = 1; BlasBase *base; base = clMath::BlasBase::getInstance(); if ((fn == FN_DSBMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SbmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha; ElemType beta; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template SbmvPerformanceTest::SbmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)( ( (2 * (params->N) * (params->K + 1) // A-access - (2 * params->K * (params->K+1)) ) // Substract hole-part for A & X +( ((2*params->K + 1) * params->N + 2*params->N)) // X & Y access ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL) { size_t lenA, lenX, lenY; lenA = params_.N * (params_.lda) + params_.offA; lenX = params_.N - 1* params_.incx + 1 + params_.offBX; lenY = params_.N - 1* params_.incy + 1 + params_.offCY; A_ = new ElemType[ lenA ]; X_ = new ElemType[ lenX ]; Y_ = new ElemType[ lenY ]; backY_ = new ElemType[ lenY ]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template SbmvPerformanceTest::~SbmvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SbmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N, lda = params->lda; size_t lenA = (n * lda) + params->offA* sizeof(ElemType); size_t lenX = (n - 1) * params->incx + 1 + params->offBX * sizeof(ElemType); size_t lenY = (n - 1) * params->incy + 1 + 
params->offCY * sizeof(ElemType); if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize ); return suff; } template int SbmvPerformanceTest::prepare(void) { size_t lenX, lenY, lenA; lenA = (params_.N * params_.lda) + params_.offA; if (params_.transA == clblasNoTrans) { lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX; lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY; } else { lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX; lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY; } randomGbmvMatrices(params_.order, clblasNoTrans , params_.N, params_.N, &alpha, &beta, (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy ); memcpy(backY_, Y_, lenY * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1; } template nano_time_t SbmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder fOrder; clblasUplo fUplo; size_t lda, lenY; size_t fN = params_.N, fK = params_.K; lenY = (params_.N - 1) * params_.incy + 1 + params_.offCY; memcpy(Y_, backY_, lenY * sizeof(ElemType)); fOrder = params_.order; fUplo = params_.uplo; lda = params_.lda; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fUplo = (params_.uplo == clblasLower)? 
clblasUpper : clblasLower; fN = params_.N; } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::sbmv(fOrder, fUplo, fN, fK , alpha, A_, params_.offA, lda, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SbmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; size_t lenY; cl_command_queue queue = base_->commandQueues()[0]; lenY = (params_.N - 1)* params_.incy + 1 + params_.offCY; status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, lenY * sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { status = clMath::clblas::sbmv(params_.order, params_.uplo, params_.N, params_.K, alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS GBMV function failed, status = " << status << endl; return NANOTIME_ERR; } } clFinish( queue ); time = getCurrentTime() - time; time /= iter; return time; } } // namespace clMath // sgbmv performance test TEST_P(SBMV, ssbmv) { TestParams params; getParams(¶ms); SbmvPerformanceTest::runInstance(FN_SSBMV, ¶ms); } // dgbmv performance test case TEST_P(SBMV, dsbmv) { TestParams params; getParams(¶ms); SbmvPerformanceTest::runInstance(FN_DSBMV, ¶ms); } clblas-2.10/src/tests/performance/perf-scal.cpp000066400000000000000000000223451264277366700215400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * SCAL performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class ScalPerformanceTest : public PerformanceTest { public: virtual ~ScalPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { ScalPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DSCAL) || (fn == FN_ZSCAL) || (fn == FN_ZDSCAL)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: ScalPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *X_; ElemType *backX_; cl_mem mobjX_; size_t lengthX; ::clMath::BlasBase *base_; }; template ScalPerformanceTest::ScalPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL) { X_ = backX_ = NULL; mobjX_= NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); try { X_ = new ElemType[lengthX + params_.offBX]; backX_ = new ElemType[lengthX + params_.offBX]; } catch(bad_alloc& ba) { X_ = backX_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template ScalPerformanceTest::~ScalPerformanceTest() { if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if( mobjX_ != NULL ) clReleaseMemObject(mobjX_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool ScalPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; if((X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = ((lengthX + params->offBX) * sizeof(ElemType)) < allocSize; ret = ret && ( ((lengthX + params->offBX) * sizeof(ElemType)) < gmemSize); return ret; } template int ScalPerformanceTest::prepare(void) { alpha_ = convertMultiplier(params_.alpha); randomVectors(params_.N, (X_ + params_.offBX), params_.incx); memcpy(backX_, X_, (lengthX + params_.offBX)* sizeof(ElemType)); mobjX_ = base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(*X_)), 0, CL_MEM_READ_WRITE); return (mobjX_ != NULL)? 0 : -1; } template nano_time_t ScalPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; bool is_css_zds = (params_.K == 1)? 
true: false; // K indicates csscal/zdscal #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::scal(is_css_zds, params_.N, alpha_, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t ScalPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; bool is_css_zds = (params_.K == 1)? true: false; // K indicates csscal/zdscal status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthX + params_.offBX) * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 50; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::scal(is_css_zds, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SCAL function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // scal performance test TEST_P(SCAL, sscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_SSCAL, ¶ms); } TEST_P(SCAL, dscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_DSCAL, ¶ms); } TEST_P(SCAL, cscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_CSCAL, ¶ms); } TEST_P(SCAL, zscal) { TestParams params; getParams(¶ms); params.K = 0; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_ZSCAL, ¶ms); } TEST_P(SCAL, csscal) { TestParams params; getParams(¶ms); params.K = 1; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_CSSCAL, ¶ms); } TEST_P(SCAL, zdscal) { TestParams params; getParams(¶ms); params.K = 1; // K will indicate wheather routine is csscal/zdscal ScalPerformanceTest::runInstance(FN_ZDSCAL, ¶ms); } clblas-2.10/src/tests/performance/perf-spmv.cpp000066400000000000000000000227561264277366700216110ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Spmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SpmvPerformanceTest : public PerformanceTest { public: virtual ~SpmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SpmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; //FIX-ME if ((fn == FN_DSPMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SpmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *AP_; ElemType *X_; ElemType *Y_; ElemType *backY_; cl_mem mobjAP_; cl_mem mobjX_; cl_mem mobjY_; ElemType alpha, beta; ::clMath::BlasBase *base_; }; template SpmvPerformanceTest::SpmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( ((2 * (( params->N * (params->N)) + params->N)) ) * sizeof(ElemType) ) ) ), params_(*params), mobjAP_(NULL), mobjX_(NULL) { AP_ = new ElemType[((params_.N * (params_.N + 1)) / 2 ) + params_.offA]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backY_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; alpha = convertMultiplier(params_.alpha); beta = convertMultiplier(params_.beta); base_ = ::clMath::BlasBase::getInstance(); mobjAP_ = NULL; mobjX_ = NULL; mobjY_ = NULL; } template SpmvPerformanceTest::~SpmvPerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(X_ != NULL) { delete[] X_; } if(backY_ != NULL) { delete[] backY_; } if(Y_ != NULL) { delete[] Y_; } if ( mobjAP_ != NULL ) clReleaseMemObject(mobjAP_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjY_ != NULL ) clReleaseMemObject(mobjY_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SpmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*((n*(n+1))/2) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( ((n*(n+1))/2) + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int SpmvPerformanceTest::prepare(void) { size_t lenX, N, lenY; N = params_.N; lenX = 1 + (N-1) * abs(params_.incx); lenY = 1 + (N-1) * abs(params_.incy); randomSpmvMatrices(params_.order, params_.uplo, N, true, &alpha, (AP_ + params_.offA), (X_ + params_.offBX), params_.incx, true, &beta, (Y_ + params_.offCY), params_.incy); memcpy(backY_, Y_, (lenY+ params_.offCY )* sizeof(ElemType)); mobjAP_ = base_->createEnqueueBuffer(AP_, (((params_.N * (params_.N + 1)) / 2 ) + params_.offA)* sizeof(*AP_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY )* sizeof(*Y_), 0, CL_MEM_READ_WRITE); return ( (mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL) ) ? 0 : -1; } template nano_time_t SpmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? 
clblasLower : clblasUpper; } time = getCurrentTime(); clMath::blas::spmv(order, fUplo, params_.N, alpha, AP_, params_.offA, X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SpmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; int lenY = 1 + (params_.N-1) * abs(params_.incy); status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lenY + params_.offCY )* sizeof(ElemType), backY_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::spmv(params_.order, params_.uplo, params_.N, alpha, mobjAP_, params_.offA, mobjX_, params_.offBX, params_.incx, beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SPMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; } } // namespace clMath TEST_P(SPMV, sspmv) { TestParams params; getParams(¶ms); SpmvPerformanceTest::runInstance(FN_SSPMV, ¶ms); } TEST_P(SPMV, dspmv) { TestParams params; getParams(¶ms); SpmvPerformanceTest::runInstance(FN_DSPMV, ¶ms); } clblas-2.10/src/tests/performance/perf-spr.cpp000066400000000000000000000231301264277366700214130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SprPerformanceTest : public PerformanceTest { public: virtual ~SprPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SprPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DSPR) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SprPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *AP_; ElemType *X_; ElemType *backAP_; cl_mem mobjAP_; cl_mem mobjX_; ::clMath::BlasBase *base_; }; template SprPerformanceTest::SprPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))), params_(*params), mobjAP_(NULL), mobjX_(NULL) { AP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backAP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template SprPerformanceTest::~SprPerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(backAP_ != NULL) { delete[] backAP_; } if(X_ != NULL) { delete[] X_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjAP_ != NULL) { clReleaseMemObject(mobjAP_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SprPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*( ( n*( n + 1 ) )/2 ) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && (((( (n *(n + 1 ) )/2 )+ (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int SprPerformanceTest::prepare(void) { bool useAlpha = true; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); alpha_ = convertMultiplier(params_.alpha); /* int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (this-> params_.uplo) == clblasLower)? 
(creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYR; // Matrix A populate( (A_ + params_.offa), params_.N, params_.N, params_.lda, BlasFn, creationFlags); populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn); */ randomSyrMatrices(params_.order, params_.uplo, params_.N, useAlpha, &alpha_, (AP_ + params_.offa), 0, (X_ + params_.offBX), params_.incx); memcpy(backAP_, AP_, ((( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(ElemType))); mobjAP_ = base_->createEnqueueBuffer(AP_, (( params_.N*( params_.N + 1 )/2 )+ params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjAP_ != NULL) && (mobjX_ != NULL) ) ? 0 : -1; } template nano_time_t SprPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; if( params_.transA == clblasConjTrans ) doConjugate( (AP_ +params_.offa), (( params_.N * (params_.N + 1)) / 2) , 1, 1 ); } time = getCurrentTime(); clMath::blas::spr(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, AP_, params_.offa); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SprPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0, (((( params_.N * (params_.N + 1)) / 2)) + params_.offa) * sizeof(ElemType), backAP_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 100; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::spr(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SPR function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(SPR, sspr) { TestParams params; getParams(¶ms); SprPerformanceTest::runInstance(FN_SSPR, ¶ms); } TEST_P(SPR, dspr) { TestParams params; getParams(¶ms); SprPerformanceTest::runInstance(FN_DSPR, ¶ms); } clblas-2.10/src/tests/performance/perf-spr2.cpp000066400000000000000000000230441264277366700215010ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Spr2PerformanceTest : public PerformanceTest { public: virtual ~Spr2PerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Spr2PerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DSPR2) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
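/* Result convention used throughout these perf cases: run() reports a negative
 * value on a fatal resource/OpenCL failure (caught by ASSERT_GE below), a positive
 * value when the clBLAS path turned out slower than the reference (caught by
 * EXPECT_EQ), and 0 when the OpenCL version is at least as fast. */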
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Spr2PerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *AP_; ElemType *X_; ElemType *Y_; ElemType *backAP_; cl_mem mobjAP_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template Spr2PerformanceTest::Spr2PerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))), params_(*params), mobjAP_(NULL), mobjX_(NULL), mobjY_(NULL) { AP_ = new ElemType[(( params_.N*( params_.N + 1 ) )/2) + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backAP_ = new ElemType[( (params_.N*( params_.N + 1 ) )/2 )+ params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template Spr2PerformanceTest::~Spr2PerformanceTest() { if(AP_ != NULL) { delete[] AP_; } if(backAP_ != NULL) { delete[] backAP_; } if(X_ != NULL) { delete[] X_; } if(Y_ != NULL) { delete[] Y_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjY_ != NULL) { clReleaseMemObject(mobjY_); } if(mobjAP_ != NULL) { clReleaseMemObject(mobjAP_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool Spr2PerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL) || (Y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( (sizeof(ElemType)*( params_.N*( params_.N + 1 ) )/2 )< allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize) && ((1 + (n-1)*abs(params->incy))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && (((( ( params_.N*( params_.N + 1 ) )/2 )+ (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int Spr2PerformanceTest::prepare(void) { bool useAlpha = true; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); size_t lenY = 1 + (params_.N-1) * abs(params_.incy); alpha_ = convertMultiplier(params_.alpha); randomSyr2Matrices( params_.order, params_.uplo, params_.N, useAlpha, &alpha_, AP_, 0, X_, params_.incx, Y_, params_.incy); memcpy(backAP_, AP_, (((( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(ElemType))); mobjAP_ = base_->createEnqueueBuffer(AP_, ((( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX ) * sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY ) * sizeof(*Y_), 0, CL_MEM_READ_ONLY); return ((mobjAP_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1; } template nano_time_t Spr2PerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; //size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? 
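/* The ACML reference path works in column-major order only, so a row-major request
 * is re-expressed as its column-major transpose; for a symmetric packed matrix that
 * amounts to flipping which triangle (upper/lower) is referenced. */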
clblasLower : clblasUpper; doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 ); } time = getCurrentTime(); clMath::blas::spr2(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, Y_, params_.offCY, params_.incy, AP_, params_.offa); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Spr2PerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0, ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) * sizeof(ElemType), backAP_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 100; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::spr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, mobjAP_, params_.offa, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SPR2 function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(SPR2, sspr2) { TestParams params; getParams(¶ms); Spr2PerformanceTest::runInstance(FN_SSPR2, ¶ms); } TEST_P(SPR2, dspr2) { TestParams params; getParams(¶ms); Spr2PerformanceTest::runInstance(FN_DSPR2, ¶ms); } clblas-2.10/src/tests/performance/perf-swap.cpp000066400000000000000000000227711264277366700215730ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * SWAP performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SwapPerformanceTest : public PerformanceTest { public: virtual ~SwapPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SwapPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor =1; if (((fn == FN_DSWAP) || (fn == FN_ZSWAP)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SwapPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *X_; ElemType *Y_; ElemType *blasX_; ElemType *blasY_; cl_mem mobjX_; cl_mem mobjY_; size_t lengthX; size_t lengthY; ::clMath::BlasBase *base_; }; template SwapPerformanceTest::SwapPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (4 * params->N) * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL) { X_ = blasX_ = NULL; Y_ = blasY_ = NULL; lengthX = 1 + (params->N - 1) * abs(params_.incx); lengthY = 1 + (params->N - 1) * abs(params_.incy); try { X_ = new ElemType[lengthX + params_.offBX]; blasX_ = new ElemType[lengthX + params_.offBX]; Y_ = new ElemType[lengthY + params_.offCY]; blasY_ = new ElemType[lengthY + params_.offCY]; } catch(bad_alloc& ba) { X_ = Y_ = blasX_ = blasY_ = NULL; // areResourcesSufficient() will handle the rest and return mobjX_= mobjY_ = NULL; ba = ba; } base_ = ::clMath::BlasBase::getInstance(); } template SwapPerformanceTest::~SwapPerformanceTest() { if(X_ != NULL) { delete[] X_; } if(Y_ != NULL) { delete[] Y_; } if(blasX_ != NULL) { delete[] blasX_; } if(blasY_ != NULL) { delete[] blasY_; } if( mobjX_ != NULL ) { clReleaseMemObject(mobjX_); } if( mobjY_ != NULL ) { clReleaseMemObject(mobjY_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SwapPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t sizeX, sizeY; if((X_ == NULL) || (blasX_ == NULL) || (Y_ == NULL) || (blasY_ == NULL) ) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); sizeX = (lengthX + params->offBX) * sizeof(ElemType); sizeY = (lengthY + params->offCY) * sizeof(ElemType); ret = ((sizeX < allocSize) && (sizeY < allocSize)); ret = (ret && ((sizeX + sizeY) < gmemSize)); return ret; } template int SwapPerformanceTest::prepare(void) { alpha_ = convertMultiplier(params_.alpha); randomVectors(params_.N, (X_ + params_.offBX), params_.incx, (Y_ + params_.offCY), params_.incy); memcpy(blasX_, X_, (lengthX + params_.offBX)* sizeof(ElemType)); memcpy(blasY_, Y_, (lengthY + params_.offCY)* sizeof(ElemType)); mobjX_ = 
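/* SWAP overwrites both vectors, so X and Y are created below as CL_MEM_READ_WRITE
 * buffers, unlike the read-only vector inputs used by most of the other tests. */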
base_->createEnqueueBuffer(X_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); mobjY_ = base_->createEnqueueBuffer(Y_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE); return ((mobjX_ != NULL) && (mobjY_ != NULL))? 0 : -1; } template nano_time_t SwapPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::swap(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SwapPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lengthX + params_.offBX) * sizeof(ElemType), X_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vactor X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0, (lengthY + params_.offCY) * sizeof(ElemType), Y_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 100; for ( int i=1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::swap(type, params_.N, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SWAP function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // swap performance test TEST_P(SWAPXY, sswap) { TestParams params; getParams(¶ms); SwapPerformanceTest::runInstance(FN_SSWAP, ¶ms); } TEST_P(SWAPXY, dswap) { TestParams params; getParams(¶ms); SwapPerformanceTest::runInstance(FN_DSWAP, ¶ms); } TEST_P(SWAPXY, cswap) { TestParams params; getParams(¶ms); SwapPerformanceTest::runInstance(FN_CSWAP, ¶ms); } TEST_P(SWAPXY, zswap) { TestParams params; getParams(¶ms); SwapPerformanceTest::runInstance(FN_ZSWAP, ¶ms); } clblas-2.10/src/tests/performance/perf-symm.cpp000066400000000000000000000257451264277366700216120ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Symm performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" //#define SHUNT_ACML_RUN /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SymmPerformanceTest : public PerformanceTest { public: virtual ~SymmPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SymmPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_SSYMM || fn == FN_DSYMM) { opFactor = 2; } else { opFactor = 8; } if ((fn == FN_DSYMM || fn == FN_ZSYMM) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SymmPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; size_t ka, kbc; ::clMath::BlasBase *base_; }; template SymmPerformanceTest::SymmPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t) ( params->M * params->N * ( (params->side == clblasLeft)? 
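/* Problem size for SYMM is the multiply-add count M*N*K, with K being the order of
 * the symmetric matrix A: M when A is applied from the left, N when from the right.
 * run() later scales this by opFactor (2 for real data, 8 for complex data). */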
params->M : params->N ) ) ), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { if( params_.side == clblasLeft ) ka = params_.M; else ka = params_.N; if( params_.order == clblasColumnMajor ) kbc = params_.N; else kbc = params_.M; A_ = new ElemType[params_.lda * ka + params_.offa]; B_ = new ElemType[params_.ldb * kbc + params_.offb]; C_ = new ElemType[params_.ldc * kbc + params_.offc]; backC_ = new ElemType[params_.ldc * kbc + params_.offc]; base_ = ::clMath::BlasBase::getInstance(); } template SymmPerformanceTest::~SymmPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(B_ != NULL) { delete[] B_; } if(C_ != NULL) { delete[] C_; } if(backC_ != NULL) { delete[] backC_; } if( mobjC_ != NULL ) clReleaseMemObject(mobjC_); if( mobjB_ != NULL ) clReleaseMemObject(mobjB_); if( mobjA_ != NULL ) clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SymmPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; bool ret; size_t m = params->M, n = params->N; if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) { return 0; // Not enough memory for host arrays } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize; ret = ret && (std::max(m, n) * params_.ldb * sizeof(ElemType) < allocSize); ret = ret && (std::max(m, n) * params_.ldc * sizeof(ElemType) < allocSize); ret = ret && (((std::max(m, n) * params_.lda) + (std::max(m, n) * params_.ldb) + (std::max(m, n) * params_.ldc)) < gmemSize); return ret; } template int SymmPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } int creationFlags = 0; int AcreationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); AcreationFlags = ( (params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYMM; populate( A_ + params_.offa, ka, ka, params_.lda, BlasFn, (AcreationFlags )); populate( B_ + params_.offb, params_.M, params_.N, params_.ldb, BlasFn, creationFlags ); populate( C_ + params_.offc, params_.M, params_.N, params_.ldc, BlasFn, creationFlags ); memcpy( backC_, C_, (kbc * params_.ldc + params_.offc) * sizeof(ElemType) ); mobjA_ = base_->createEnqueueBuffer(A_, (params_.lda * ka + params_.offa) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(B_, (params_.ldb * kbc + params_.offb) * sizeof(ElemType), 0, CL_MEM_READ_ONLY); } if (mobjB_) { mobjC_ = base_->createEnqueueBuffer(backC_, (params_.ldc * kbc + params_.offc) * sizeof(ElemType), 0, CL_MEM_READ_WRITE); } return (mobjC_) ? 
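/* The buffers are created in a chain (B only if A succeeded, C only if B did),
 * so checking mobjC_ alone is enough: prepare() reports 0 on success and -1 if
 * any of the three allocations/uploads failed. */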
0 : -1; } template nano_time_t SymmPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasSide fSide; size_t lda, ldb, ldc, fN, fM; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; fSide = params_.side; lda = params_.lda; ldb = params_.ldb; ldc = params_.ldc; fM = params_.M; fN = params_.N; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fM = params_.N; fN = params_.M; fSide = (params_.side == clblasLeft)? clblasRight: clblasLeft; fUplo = (params_.uplo == clblasUpper)? clblasLower: clblasUpper; } time = getCurrentTime(); #ifndef SHUNT_ACML_RUN clMath::blas::symm(order, fSide, fUplo, fM, fN, alpha_, A_, params_.offa, lda, B_, params_.offb, ldb, beta_, C_, params_.offc, ldc); #endif time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SymmPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (params_.ldc * kbc + params_.offc) * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; time = getCurrentTime(); //#define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::symm(params_.order, params_.side, params_.uplo, params_.M, params_.N, alpha_, mobjA_, params_.offa, params_.lda, mobjB_, params_.offb, params_.ldb, beta_, mobjC_, params_.offc, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYMM function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // ssymm performance test TEST_P(SYMM, ssymm) { TestParams params; getParams(¶ms); SymmPerformanceTest::runInstance(FN_SSYMM, ¶ms); } TEST_P(SYMM, dsymm) { TestParams params; getParams(¶ms); SymmPerformanceTest::runInstance(FN_DSYMM, ¶ms); } TEST_P(SYMM, csymm) { TestParams params; getParams(¶ms); SymmPerformanceTest::runInstance(FN_CSYMM, ¶ms); } TEST_P(SYMM, zsymm) { TestParams params; getParams(¶ms); SymmPerformanceTest::runInstance(FN_ZSYMM, ¶ms); } clblas-2.10/src/tests/performance/perf-symv.cpp000066400000000000000000000230321264277366700216060ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Symv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SymvPerformanceTest : public PerformanceTest { public: virtual ~SymvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SymvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = (fn == FN_SSYMV) ? sizeof(cl_float) : sizeof(cl_double); if ((fn == FN_DSYMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SymvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template SymvPerformanceTest::SymvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->N * params->N), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template SymvPerformanceTest::~SymvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(B_ != NULL) { delete[] B_; } if(C_ != NULL) { delete[] C_; } if(backC_ != NULL) { delete[] backC_; } if(mobjC_ != NULL) { clReleaseMemObject(mobjC_); } if(mobjB_ != NULL) { clReleaseMemObject(mobjB_); } if(mobjC_ != NULL) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SymvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t n = params->N; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); if((A_ == NULL) || (backC_ == NULL) || (C_ == NULL) || (B_ == NULL)) { return 0; // Not enough memory for host arrays } maxMatrSize = gmemSize / 3; maxMatrSize = std::min(maxMatrSize, allocSize); return (n * n * sizeof(ElemType) < maxMatrSize); } template int SymvPerformanceTest::prepare(void) { size_t lenX, lenY; bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } lenX = params_.N; lenY = params_.N; randomGemmxMatrices(params_.order, params_.transA, params_.transB, params_.transC, lenY, params_.N, lenX, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(*A_), params_.offA * sizeof(*A_), CL_MEM_READ_ONLY); mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(*B_), 0, CL_MEM_READ_ONLY); mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(*backC_), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjB_ != NULL) && (mobjC_ != NULL)) ? 
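/* SYMV reuses the GEMM-style test parameters: buffer A holds the symmetric matrix
 * (with its offA offset applied at buffer-creation time), B serves as the input
 * vector X and C as the updated vector Y, which is why only C is writable. */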
0 : -1; } template nano_time_t SymvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML // #warning "SYMV performance test not implemented" time = NANOTIME_MAX; order = order; lda = lda; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SymvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector Y buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; //#define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::symv(params_.order, params_.uplo, params_.N, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.incx, beta_, mobjC_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // ssymv performance test TEST_P(SYMV, ssymv) { TestParams params; getParams(¶ms); SymvPerformanceTest::runInstance(FN_SSYMV, ¶ms); } // dsymv performance test case TEST_P(SYMV, dsymv) { TestParams params; getParams(¶ms); SymvPerformanceTest::runInstance(FN_DSYMV, ¶ms); } clblas-2.10/src/tests/performance/perf-syr.cpp000066400000000000000000000226761264277366700214420ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Syr performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SyrPerformanceTest : public PerformanceTest { public: virtual ~SyrPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SyrPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DSYR) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SyrPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *backA_; cl_mem mobjA_; cl_mem mobjX_; ::clMath::BlasBase *base_; }; template SyrPerformanceTest::SyrPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))), params_(*params), mobjA_(NULL), mobjX_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backA_ = new ElemType[params_.N * params_.lda + params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template SyrPerformanceTest::~SyrPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(backA_ != NULL) { delete[] backA_; } if(X_ != NULL) { delete[] X_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjA_ != NULL) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SyrPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (backA_ == NULL) || (X_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int SyrPerformanceTest::prepare(void) { bool useAlpha = true; size_t lenX = 1 + 
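/* Standard BLAS sizing for a strided vector: a vector of N logical elements with
 * increment incx occupies 1 + (N-1)*|incx| array slots. */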
(params_.N-1) * abs(params_.incx); alpha_ = convertMultiplier(params_.alpha); /* int creationFlags = 0; creationFlags = creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (this-> params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_SYR; // Matrix A populate( (A_ + params_.offa), params_.N, params_.N, params_.lda, BlasFn, creationFlags); populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn); */ randomSyrMatrices( params_.order, params_.uplo, params_.N, useAlpha, &alpha_, A_, params_.lda, X_, params_.incx); memcpy(backA_, A_, ((params_.N * params_.lda + params_.offa)* sizeof(ElemType))); mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa)* sizeof(*A_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjA_ != NULL) && (mobjX_ != NULL) ) ? 0 : -1; } template nano_time_t SyrPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; lda = params_.lda; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; } time = getCurrentTime(); clMath::blas::syr(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, A_, params_.offa, lda); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SyrPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, ((params_.N * params_.lda) + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 100; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::syr(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYR function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // ssyr performance test TEST_P(SYR, ssyr) { TestParams params; getParams(¶ms); SyrPerformanceTest::runInstance(FN_SSYR, ¶ms); } // dsyr 
performance test case TEST_P(SYR, dsyr) { TestParams params; getParams(¶ms); SyrPerformanceTest::runInstance(FN_DSYR, ¶ms); } clblas-2.10/src/tests/performance/perf-syr2.cpp000066400000000000000000000227131264277366700215140ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Syr2 performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Syr2PerformanceTest : public PerformanceTest { public: virtual ~Syr2PerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Syr2PerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DSYR2) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Syr2PerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *Y_; ElemType *backA_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjY_; ::clMath::BlasBase *base_; }; template Syr2PerformanceTest::Syr2PerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + (params->N)) * 3 ) * sizeof(ElemType))), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; Y_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incy) + params_.offCY]; backA_ = new ElemType[params_.N * params_.lda + params_.offa]; base_ = ::clMath::BlasBase::getInstance(); } template Syr2PerformanceTest::~Syr2PerformanceTest() { if(A_ != NULL) { delete[] A_; } if(backA_ != NULL) { delete[] backA_; } if(X_ != NULL) { delete[] X_; } if(Y_ != NULL) { delete[] Y_; } if(mobjX_ != NULL) { clReleaseMemObject(mobjX_); } if(mobjY_ != NULL) { clReleaseMemObject(mobjY_); } if(mobjA_ != NULL) { clReleaseMemObject(mobjA_); } } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool Syr2PerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (backA_ == NULL) || (X_ == NULL) || (Y_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize) && ((1 + (n-1)*abs(params->incy))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx)) + (1 + (n-1)*abs(params->incy)))*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int Syr2PerformanceTest::prepare(void) { bool useAlpha = true; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); size_t lenY = 1 + (params_.N-1) * abs(params_.incy); alpha_ = convertMultiplier(params_.alpha); randomSyr2Matrices( params_.order, params_.uplo, params_.N, useAlpha, &alpha_, A_, params_.lda, X_, params_.incx, Y_, params_.incy); memcpy(backA_, A_, ((params_.N * params_.lda + params_.offa)* sizeof(ElemType))); mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa)* sizeof(*A_), 0, CL_MEM_READ_WRITE); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX ) * sizeof(*X_), 0, CL_MEM_READ_ONLY); mobjY_ = base_->createEnqueueBuffer(Y_, (lenY + params_.offCY ) * sizeof(*Y_), 0, CL_MEM_READ_ONLY); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1; } template nano_time_t Syr2PerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; lda = params_.lda; fUplo = params_.uplo; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? 
clblasLower : clblasUpper; } time = getCurrentTime(); clMath::blas::syr2(order, fUplo, params_.N, alpha_, X_, params_.offBX, params_.incx, Y_, params_.offCY, params_.incy, A_, params_.offa, lda); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Syr2PerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, ((params_.N * params_.lda) + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 100; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::syr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYR2 function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath // ssyr performance test TEST_P(SYR2, ssyr2) { TestParams params; getParams(¶ms); Syr2PerformanceTest::runInstance(FN_SSYR2, ¶ms); } // dsyr performance test case TEST_P(SYR2, dsyr2) { TestParams params; getParams(¶ms); Syr2PerformanceTest::runInstance(FN_DSYR2, ¶ms); } clblas-2.10/src/tests/performance/perf-syr2k.cpp000066400000000000000000000234251264277366700216700ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Syr2k performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class Syr2kPerformanceTest : public PerformanceTest { public: virtual ~Syr2kPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { Syr2kPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_SSYR2K || fn == FN_DSYR2K) { opFactor = 2; } else { opFactor = 8; } if ((fn == FN_DSYR2K || fn == FN_ZSYR2K) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: Syr2kPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *B_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjB_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template Syr2kPerformanceTest::Syr2kPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->N * params->N * params->K), params_(*params), mobjA_(NULL), mobjB_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; B_ = new ElemType[params_.rowsB * params_.columnsB]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template Syr2kPerformanceTest::~Syr2kPerformanceTest() { delete[] A_; delete[] B_; delete[] C_; delete[] backC_; clReleaseMemObject(mobjC_); clReleaseMemObject(mobjB_); clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool Syr2kPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t n = params->N, k = params->K; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); maxMatrSize = gmemSize / 3; maxMatrSize = std::min(maxMatrSize, allocSize); return (n * k * sizeof(ElemType) < maxMatrSize); } template int Syr2kPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); clblasTranspose transB; if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ 
= convertMultiplier(params_.beta); } transB = (params_.transA == clblasNoTrans) ? clblasTrans : clblasNoTrans; randomGemmMatrices(params_.order, params_.transA, transB, params_.N, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda, B_, params_.ldb, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjB_ = base_->createEnqueueBuffer(B_, params_.rowsB * params_.columnsB * sizeof(ElemType), params_.offBX * sizeof(ElemType), CL_MEM_READ_ONLY); } if (mobjB_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 0 : -1; } template nano_time_t Syr2kPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, ldb, ldc; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; ldb = params_.ldb; ldc = params_.ldc; #ifdef PERF_TEST_WITH_ACML // #warning "SYR2K performance test not implemented" time = NANOTIME_MAX; order = order; lda = lda; ldb = ldb; ldc = ldc; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t Syr2kPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; status = (cl_int)clMath::clblas::syr2k(params_.order, params_.uplo, params_.transA, params_.N, params_.K, alpha_, mobjA_, params_.offA, params_.lda, mobjB_, params_.offBX, params_.ldb, beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYR2K function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } return time; } } // namespace clMath // ssyr2k performance test TEST_P(SYR2K, ssyr2k) { TestParams params; getParams(¶ms); Syr2kPerformanceTest::runInstance(FN_SSYR2K, ¶ms); } // dsyr2k performance test case TEST_P(SYR2K, dsyr2k) { TestParams params; getParams(¶ms); Syr2kPerformanceTest::runInstance(FN_DSYR2K, ¶ms); } // csyr2k performance test TEST_P(SYR2K, csyr2k) { TestParams params; getParams(¶ms); Syr2kPerformanceTest::runInstance(FN_CSYR2K, ¶ms); } // zsyr2k performance test case TEST_P(SYR2K, zsyr2k) { TestParams params; getParams(¶ms); Syr2kPerformanceTest::runInstance(FN_ZSYR2K, ¶ms); } clblas-2.10/src/tests/performance/perf-syrk.cpp000066400000000000000000000221221264277366700215770ustar00rootroot00000000000000/* 
************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Syrk performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class SyrkPerformanceTest : public PerformanceTest { public: virtual ~SyrkPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { SyrkPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); if (fn == FN_SSYRK || fn == FN_DSYRK) { opFactor = 1; } else { opFactor = 4; } if ((fn == FN_DSYRK || fn == FN_ZSYRK) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
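/* opFactor above reflects the real-arithmetic cost per element update: 1 for the
 * single multiply-add of real SYRK, 4 for complex SYRK, since a complex
 * multiply-add expands to roughly four real multiplies and four real adds. */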
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: SyrkPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType alpha_; ElemType beta_; ElemType *A_; ElemType *C_; ElemType *backC_; cl_mem mobjA_; cl_mem mobjC_; ::clMath::BlasBase *base_; }; template SyrkPerformanceTest::SyrkPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)params->N * params->N * params->K), params_(*params), mobjA_(NULL), mobjC_(NULL) { A_ = new ElemType[params_.rowsA * params_.columnsA]; C_ = new ElemType[params_.rowsC * params_.columnsC]; backC_ = new ElemType[params_.rowsC * params_.columnsC]; base_ = ::clMath::BlasBase::getInstance(); } template SyrkPerformanceTest::~SyrkPerformanceTest() { delete[] A_; delete[] C_; delete[] backC_; clReleaseMemObject(mobjC_); clReleaseMemObject(mobjA_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool SyrkPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize, maxMatrSize; size_t n = params->N, k = params->K; base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); maxMatrSize = gmemSize / 3; maxMatrSize = std::min(maxMatrSize, allocSize); return (n * k * sizeof(ElemType) < maxMatrSize); } template int SyrkPerformanceTest::prepare(void) { bool useAlpha = base_->useAlpha(); bool useBeta = base_->useBeta(); if (useAlpha) { alpha_ = convertMultiplier(params_.alpha); } if (useBeta) { beta_ = convertMultiplier(params_.beta); } randomGemmMatrices(params_.order, params_.transA, clblasNoTrans, params_.N, params_.N, params_.K, useAlpha, &alpha_, A_, params_.lda, NULL, 0, useBeta, &beta_, C_, params_.ldc); mobjA_ = base_->createEnqueueBuffer(A_, params_.rowsA * params_.columnsA * sizeof(ElemType), params_.offA * sizeof(ElemType), CL_MEM_READ_ONLY); if (mobjA_) { mobjC_ = base_->createEnqueueBuffer(backC_, params_.rowsC * params_.columnsC * sizeof(ElemType), params_.offCY * sizeof(ElemType), CL_MEM_READ_WRITE); } return (mobjC_) ? 
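//
// [Editor's note: illustrative arithmetic, not part of the original clBLAS
//  sources.]  areResourcesSufficient() above treats at most a third of the
//  device global memory, further capped by the maximum single allocation,
//  as usable for one matrix.  For example, for N = K = 5600 in double
//  precision one matrix needs 5600 * 5600 * 8 = 250,880,000 bytes, about
//  0.23 GiB; on a device advertising 1 GiB of global memory the limit is
//  roughly 0.33 GiB, so the case runs provided the per-allocation limit
//  reported by maxMemAllocSize() is at least as large.
//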
0 : -1; } template nano_time_t SyrkPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; size_t lda, ldc; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(C_, backC_, params_.rowsC * params_.columnsC * sizeof(ElemType)); order = params_.order; lda = params_.lda; ldc = params_.ldc; #ifdef PERF_TEST_WITH_ACML // #warning "SYRK performance test not implemented" time = NANOTIME_MAX; order = order; lda = lda; ldc = ldc; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t SyrkPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, params_.rowsC * params_.columnsC * sizeof(ElemType), backC_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix C buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; status = (cl_int)clMath::clblas::syrk(params_.order, params_.uplo, params_.transA, params_.N, params_.K, alpha_, mobjA_, params_.offA, params_.lda, beta_, mobjC_, params_.offCY, params_.ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYRK function failed, status = " << status << endl; return NANOTIME_ERR; } status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } return time; } } // namespace clMath // ssyrk performance test TEST_P(SYRK, ssyrk) { TestParams params; getParams(¶ms); SyrkPerformanceTest::runInstance(FN_SSYRK, ¶ms); } // dsyrk performance test case TEST_P(SYRK, dsyrk) { TestParams params; getParams(¶ms); SyrkPerformanceTest::runInstance(FN_DSYRK, ¶ms); } // csyrk performance test TEST_P(SYRK, csyrk) { TestParams params; getParams(¶ms); SyrkPerformanceTest::runInstance(FN_CSYRK, ¶ms); } // zsyrk performance test case TEST_P(SYRK, zsyrk) { TestParams params; getParams(¶ms); SyrkPerformanceTest::runInstance(FN_ZSYRK, ¶ms); } clblas-2.10/src/tests/performance/perf-tbmv.cpp000066400000000000000000000227211264277366700215640ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ /* * Tbmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TbmvPerformanceTest : public PerformanceTest { public: virtual ~TbmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TbmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor = 1; BlasBase *base; base = clMath::BlasBase::getInstance(); if ((fn == FN_DTBMV || fn == FN_ZTBMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TbmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *A_; ElemType *X_; ElemType *backX_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjScratch_; ::clMath::BlasBase *base_; }; template TbmvPerformanceTest::TbmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)( params->N * (params->K+1) * 2 // A & X access - (params->K * (params->K+1) ) // Substract hole-part for A & X + (2*params->N) /* Y access */ ) * sizeof(ElemType) ), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL) { size_t lenA, lenX; lenA = params_.N * params_.lda + params_.offA; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; A_ = new ElemType[ lenA ]; X_ = new ElemType[ lenX ]; backX_ = new ElemType[ lenX ]; base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjScratch_ = NULL; } template TbmvPerformanceTest::~TbmvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjScratch_ != NULL ) clReleaseMemObject(mobjScratch_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TbmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N, lda = params->lda; size_t lenA = (n * lda + params->offA)* sizeof(ElemType); size_t lenX = ((params->N - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType); if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize ); return suff; } template int 
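//
// [Editor's note: illustrative sketch, not part of the original clBLAS
//  sources.]  TBMV is a bandwidth-bound BLAS-2 routine, so the constructor
//  above passes a byte count (the band of A plus the vector traffic, scaled
//  by sizeof(ElemType)) as the problem size and runInstance() uses an
//  operation factor of 1.  With times kept in nanoseconds, bytes moved
//  divided by elapsed time is numerically a GB/s figure:
//
//      // hypothetical helper; the name is not part of the test suite
//      double gigabytesPerSecond(size_t bytesMoved, nano_time_t ns)
//      {
//          return (double)bytesMoved / (double)ns;   // bytes/ns == GB/s
//      }
//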
TbmvPerformanceTest::prepare(void) { size_t lenX, lenA; lenA = params_.N * params_.lda + params_.offA; lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX; randomTbmvMatrices( params_.N, (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx ); memcpy(backX_, X_, lenX * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjScratch_ = base_->createEnqueueBuffer(backX_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjScratch_ != NULL)) ? 0 : -1; } template nano_time_t TbmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; size_t lda, lenA, lenX; lenA = params_.N * params_.lda; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; memcpy(X_, backX_, lenX * sizeof(ElemType)); fOrder = params_.order; fTrans = params_.transA; fUplo = params_.uplo; lda = params_.lda; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans; fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower; if( params_.transA == clblasConjTrans ) doConjugate( (A_+params_.offA), 1, lenA, lda ); } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::tbmv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TbmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; size_t lenX; cl_command_queue queue = base_->commandQueues()[0]; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
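//
// [Editor's note: illustrative sketch, not part of the original clBLAS
//  sources.]  The measurement just below enqueues the same TBMV call 20
//  times, drains the queue with clFinish(), and divides the elapsed
//  wall-clock time by the iteration count.  Stripped of the clBLAS call
//  itself, the averaging pattern is:
//
//      nano_time_t t0 = getCurrentTime();
//      for (int i = 0; i < iter; ++i) {
//          /* enqueue the kernel under test */
//      }
//      clFinish(queue);                  // wait for all queued iterations
//      nano_time_t avg = (getCurrentTime() - t0) / iter;
//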
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { status = clMath::clblas::tbmv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, mobjScratch_, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TBMV function failed, status = " << status << endl; return NANOTIME_ERR; } } clFinish( queue ); time = getCurrentTime() - time; time /= iter; return time; } } // namespace clMath // stbmv performance test TEST_P(TBMV, stbmv) { TestParams params; getParams(¶ms); TbmvPerformanceTest::runInstance(FN_STBMV, ¶ms); } // dtbmv performance test case TEST_P(TBMV, dtbmv) { TestParams params; getParams(¶ms); TbmvPerformanceTest::runInstance(FN_DTBMV, ¶ms); } // ctbmv performance test TEST_P(TBMV, ctbmv) { TestParams params; getParams(¶ms); TbmvPerformanceTest::runInstance(FN_CTBMV, ¶ms); } // ztbmv performance test case TEST_P(TBMV, ztbmv) { TestParams params; getParams(¶ms); TbmvPerformanceTest::runInstance(FN_ZTBMV, ¶ms); } clblas-2.10/src/tests/performance/perf-tbsv.cpp000066400000000000000000000226661264277366700216020ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Tbsv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TbsvPerformanceTest : public PerformanceTest { public: virtual ~TbsvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TbsvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor = 1; BlasBase *base; base = clMath::BlasBase::getInstance(); if ((fn == FN_DTBSV || fn == FN_ZTBSV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TbsvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *A_; ElemType *X_; ElemType *backX_; cl_mem mobjA_; cl_mem mobjX_; cl_mem mobjScratch_; ::clMath::BlasBase *base_; }; template TbsvPerformanceTest::TbsvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest(fn, (problem_size_t)( params->N * (params->K+1) * 2 // A & X access - (params->K * (params->K+1) ) // Substract hole-part for A & X + (2*params->N) /* Y access */ ) * sizeof(ElemType) ), params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL) { size_t lenA, lenX; lenA = params_.N * params_.lda + params_.offA; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; A_ = new ElemType[ lenA ]; X_ = new ElemType[ lenX ]; backX_ = new ElemType[ lenX ]; base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; mobjScratch_ = NULL; } template TbsvPerformanceTest::~TbsvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( mobjScratch_ != NULL ) clReleaseMemObject(mobjScratch_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TbsvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N, lda = params->lda; size_t lenA = (n * lda + params->offA)* sizeof(ElemType); size_t lenX = ((params->N - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType); if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize(0); allocSize = (size_t)base->maxMemAllocSize(); bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize ); return suff; } template int TbsvPerformanceTest::prepare(void) { size_t lenX, lenA; lenA = params_.N * params_.lda + params_.offA; lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX; randomTbsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, params_.K, (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx ); memcpy(backX_, X_, lenX * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE); mobjScratch_ = base_->createEnqueueBuffer(backX_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjScratch_ != NULL)) ? 0 : -1; } template nano_time_t TbsvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder fOrder; clblasTranspose fTrans; clblasUplo fUplo; size_t lda, lenA, lenX; lenA = params_.N * params_.lda; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; memcpy(X_, backX_, lenX * sizeof(ElemType)); fOrder = params_.order; fTrans = params_.transA; fUplo = params_.uplo; lda = params_.lda; if (fOrder != clblasColumnMajor) { fOrder = clblasColumnMajor; fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans; fUplo = (params_.uplo == clblasLower)? 
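//
// [Editor's note: clarifying comment, not part of the original clBLAS
//  sources.]  The order fix-up here relies on the usual identity: the same
//  buffer read with the opposite major order is the transpose of the
//  original matrix.  E.g. the row-major upper-triangular 2x2 matrix
//  [[a, b], [0, d]] is stored as {a, b, 0, d}; read column-major, that
//  buffer is [[a, 0], [b, d]], i.e. the lower-triangular transpose.  A
//  row-major problem can therefore be handed to a column-major reference
//  BLAS by flipping both the trans and uplo arguments; only the
//  conjugation of a ConjTrans case must be applied to the data explicitly.
//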
clblasUpper : clblasLower; if( params_.transA == clblasConjTrans ) doConjugate( (A_+params_.offA), 1, lenA, lda ); } #ifdef PERF_TEST_WITH_ACML time = getCurrentTime(); clMath::blas::tbsv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TbsvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; size_t lenX; cl_command_queue queue = base_->commandQueues()[0]; lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; event = NULL; time = getCurrentTime(); int iter = 20; for ( int i = 1; i <= iter; i++) { status = clMath::clblas::tbsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TBSV function failed, status = " << status << endl; return NANOTIME_ERR; } } clFinish( queue ); time = getCurrentTime() - time; time /= iter; return time; } } // namespace clMath TEST_P(TBSV, stbsv) { TestParams params; getParams(¶ms); TbsvPerformanceTest::runInstance(FN_STBSV, ¶ms); } TEST_P(TBSV, dtbsv) { TestParams params; getParams(¶ms); TbsvPerformanceTest::runInstance(FN_DTBSV, ¶ms); } TEST_P(TBSV, ctbsv) { TestParams params; getParams(¶ms); TbsvPerformanceTest::runInstance(FN_CTBSV, ¶ms); } TEST_P(TBSV, ztbsv) { TestParams params; getParams(¶ms); TbsvPerformanceTest::runInstance(FN_ZTBSV, ¶ms); } clblas-2.10/src/tests/performance/perf-tpmv.cpp000066400000000000000000000255411264277366700216050ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TpmvPerformanceTest : public PerformanceTest { public: virtual ~TpmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TpmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); /* *************Important********************* if (fn == FN_STRMV || fn == FN_DTRMV) { opFactor = 2; } else { opFactor = 8; } this is only for blas-3 routines- operations factor FOR BLAS-2(bandwidth intensive) ROUTINES MAKE opFactor AS 1 and pass the appropriate size that is read and written in the constructor below */ opFactor = 1; //FIX-ME if ((fn == FN_DTPMV || fn == FN_ZTPMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TpmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *AP_; ElemType *X_; ElemType *backX_; cl_mem mobjAP_; cl_mem mobjX_; cl_mem scratchBuff; ::clMath::BlasBase *base_; }; template TpmvPerformanceTest::TpmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ), params_(*params), mobjAP_(NULL), mobjX_(NULL) { AP_ = new ElemType[( ( params_.N *( params_.N + 1 ) )/2 ) + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backX_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; base_ = ::clMath::BlasBase::getInstance(); mobjAP_ = NULL; mobjX_ = NULL; scratchBuff = NULL; } template TpmvPerformanceTest::~TpmvPerformanceTest() // Matrix A { if(AP_ != NULL) { delete[] AP_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if ( mobjAP_ != NULL ) clReleaseMemObject(mobjAP_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( scratchBuff != NULL ) clReleaseMemObject(scratchBuff); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TpmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((AP_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*( ( n *( n + 1 ) )/2 )< allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( ( ( n *( n + 1 ) )/2 ) + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int TpmvPerformanceTest::prepare(void) { size_t lenX, n; n = params_.N; lenX = 1 + (n-1) * abs(params_.incx); int creationFlags = 0; creationFlags = 
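//
// [Editor's note: illustrative arithmetic, not part of the original clBLAS
//  sources.]  TPMV works on packed triangular storage: an N x N triangle
//  holds 1 + 2 + ... + N = N*(N+1)/2 elements, which is why AP_ above is
//  allocated with (N*(N+1))/2 + offa entries and why PACKED_MATRIX is
//  OR'ed into the creation flags here.  For example, N = 2048 in single
//  precision packs into 2048*2049/2 = 2,098,176 elements (about 8.4 MB),
//  roughly half of the 2048*2048*4 = 16.8 MB a full, unpacked matrix
//  would occupy.
//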
creationFlags | RANDOM_INIT | PACKED_MATRIX; // Default is Column-Major creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (this-> params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; populate( (AP_ + params_.offa), n, n, 0, BlasFn, creationFlags); populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn); memcpy(backX_, X_, ((1 + (params_.N-1) * abs(params_.incx))+ params_.offBX )* sizeof(ElemType)); mobjAP_ = base_->createEnqueueBuffer(AP_,( (( n *( n + 1 ) )/2 ) + params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_WRITE_ONLY); scratchBuff = base_->createEnqueueBuffer(NULL , lenX * sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjAP_ != NULL) && (mobjX_ != NULL) && (scratchBuff != NULL) ) ? 0 : -1; } template nano_time_t TpmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; //size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; fTrans = params_.transA; //lda = params_.lda; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans; if( params_.transA == clblasConjTrans ) doConjugate( (AP_+params_.offa), (( params_.N * (params_.N + 1)) / 2) , 1, 1 ); } time = getCurrentTime(); clMath::blas::tpmv(order, fUplo,fTrans, params_.diag, params_.N, AP_, params_.offa, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TpmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lenX + params_.offBX )* sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::tpmv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjAP_, params_.offa, mobjX_, params_.offBX, params_.incx, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TPMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; } } // namespace clMath // strmv performance test TEST_P(TPMV, stpmv) { TestParams params; getParams(¶ms); TpmvPerformanceTest::runInstance(FN_STPMV, ¶ms); } // dtrmv performance test case TEST_P(TPMV, dtpmv) { TestParams params; getParams(¶ms); TpmvPerformanceTest::runInstance(FN_DTPMV, ¶ms); } // ctrmv performance test case TEST_P(TPMV, ctpmv) { TestParams params; getParams(¶ms); TpmvPerformanceTest::runInstance(FN_CTPMV, ¶ms); } // ztrmv performance test case TEST_P(TPMV, ztpmv) { TestParams params; getParams(¶ms); TpmvPerformanceTest::runInstance(FN_ZTPMV, ¶ms); } clblas-2.10/src/tests/performance/perf-tpsv.cpp000066400000000000000000000246451264277366700216170ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Gemv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TpsvPerformanceTest : public PerformanceTest { public: virtual ~TpsvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TpsvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DTPSV || fn == FN_ZTPSV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TpsvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; // ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *backX_; cl_mem mobjA_; cl_mem mobjX_; size_t lengthA; ::clMath::BlasBase *base_; }; template TpsvPerformanceTest::TpsvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL) { lengthA = (params_.N * (params_.N + 1))/2; A_ = new ElemType[(lengthA) + params_.offa]; X_ = new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ]; backX_ = new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ]; base_ = ::clMath::BlasBase::getInstance(); } template TpsvPerformanceTest::~TpsvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if( mobjX_ != NULL ) clReleaseMemObject(mobjX_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TpsvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*((n*(n+1))/2) < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( ((n*(n+1))/2) + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int TpsvPerformanceTest::prepare(void) { size_t lenX, N; N = params_.N; lenX = 1 + ((N-1) *abs(params_.incx)) + params_.offBX; randomTrsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, (A_ + params_.offa), 0, (X_ + params_.offBX), params_.incx); memcpy(backX_, X_, lenX * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, ((lengthA) + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(*X_), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) ) ? 
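//
// [Editor's note: clarifying comment, not part of the original clBLAS
//  sources.]  The vector length used throughout these tests,
//  lenX = 1 + (N - 1) * |incx| (+ offBX), is the index of the last element
//  touched by a stride-incx vector of N logical entries, plus one.  For
//  example, N = 4 and incx = 3 touch indices 0, 3, 6 and 9, so the buffer
//  needs 1 + 3*3 = 10 elements before any offset is added.
//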
0 : -1; } template nano_time_t TpsvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(X_, backX_, ((1 + ((params_.N-1) * abs(params_.incx)))+params_.offBX) * sizeof(ElemType)); order = params_.order; fUplo = params_.uplo; fTrans = params_.transA; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans; if( params_.transA == clblasConjTrans ) doConjugate( A_ + params_.offa, 1, lengthA, 1 ); } //printf("Calling ACML TPSV\n"); //printf("X Before calling %f %f %f %f\n", X_[0], X_[1], X_[2], X_[3]); time = getCurrentTime(); clMath::blas::tpsv(order, fUplo, fTrans, params_.diag, params_.N, A_, params_.offa, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; //printf("X After Calling %f %f %f %f\n", X_[0], X_[1], X_[2], X_[3]); //printf("time %lu\n", (unsigned long)time ); #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TpsvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; //printf("backX before calling %f %f %f %f\n", backX_[0], backX_[1], backX_[2], backX_[3]); DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); //#define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::tpsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjA_, params_.offa, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TPSV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; clEnqueueReadBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, NULL); /* printf("X Vector is \n"); for(int i =0 ; i // srand() #include // memcpy() #include #include #include #include #include #include #include "TrxmPerformanceTest.cpp" /* * NOTE: operation factor takes into account the same as for * gemm but also the fact that only a half of data is actually * useful */ using namespace std; using namespace clMath; // strmm performance test case TEST_P(TRMM, strmm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_STRMM, ¶ms); } // dtrmm performance test case TEST_P(TRMM, dtrmm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_DTRMM, ¶ms); } // ctrmm performance test case TEST_P(TRMM, ctrmm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_CTRMM, ¶ms); } // ztrmm performance test case TEST_P(TRMM, ztrmm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_ZTRMM, ¶ms); } clblas-2.10/src/tests/performance/perf-trmv.cpp000066400000000000000000000255751264277366700216160ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Trmv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" 
<< endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TrmvPerformanceTest : public PerformanceTest { public: virtual ~TrmvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TrmvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); /* *************Important********************* if (fn == FN_STRMV || fn == FN_DTRMV) { opFactor = 2; } else { opFactor = 8; } this is only for blas-3 routines- operations factor FOR BLAS-2(bandwidth intensive) ROUTINES MAKE opFactor AS 1 and pass the appropriate size that is read and written in the constructor below */ opFactor = 1; //FIX-ME if ((fn == FN_DTRMV || fn == FN_ZTRMV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" << endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TrmvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; ElemType *A_; ElemType *X_; ElemType *backX_; cl_mem mobjA_; cl_mem mobjX_; cl_mem scratchBuff; ::clMath::BlasBase *base_; }; template TrmvPerformanceTest::TrmvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ), //**************Gbps formula here*********** params_(*params), mobjA_(NULL), mobjX_(NULL) { A_ = new ElemType[params_.N * params_.lda + params_.offa]; X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; backX_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx) + params_.offBX]; base_ = ::clMath::BlasBase::getInstance(); mobjA_ = NULL; mobjX_ = NULL; scratchBuff = NULL; } template TrmvPerformanceTest::~TrmvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if ( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if ( mobjX_ != NULL ) clReleaseMemObject(mobjX_); if ( scratchBuff != NULL ) clReleaseMemObject(scratchBuff); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TrmvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int TrmvPerformanceTest::prepare(void) { size_t lenX, N; N = params_.N; lenX = 1 + (N-1) * abs(params_.incx); int creationFlags = 0; creationFlags = 
creationFlags | RANDOM_INIT; // Default is Column-Major creationFlags = ( (this-> params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags); creationFlags = ( (this-> params_.uplo) == clblasLower)? (creationFlags | LOWER_HALF_ONLY) : (creationFlags | UPPER_HALF_ONLY); BlasRoutineID BlasFn = CLBLAS_TRMV; // Matrix A populate( (A_ + params_.offa), N, N, params_.lda, BlasFn, creationFlags); populate( X_ , lenX + params_.offBX, 1, lenX + params_.offBX, BlasFn); memcpy(backX_, X_, ((1 + (params_.N-1) * abs(params_.incx))+ params_.offBX )* sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, (params_.N * params_.lda + params_.offa)* sizeof(*A_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_WRITE_ONLY); scratchBuff = base_->createEnqueueBuffer(NULL , lenX * sizeof(*X_), 0, CL_MEM_READ_ONLY); return ( (mobjA_ != NULL) && (mobjX_ != NULL) && (scratchBuff != NULL) ) ? 0 : -1; } template nano_time_t TrmvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif order = params_.order; fUplo = params_.uplo; fTrans = params_.transA; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans; if( params_.transA == clblasConjTrans ) doConjugate( (A_+params_.offa), params_.N, params_.N, lda ); } time = getCurrentTime(); clMath::blas::trmv(order, fUplo,fTrans, params_.diag, params_.N, A_, params_.offa, lda, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TrmvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + (params_.N-1) * abs(params_.incx); status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, (lenX + params_.offBX )* sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? 
TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::trmv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda, mobjX_, params_.offBX, params_.incx, scratchBuff, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TRMV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } //printf("Time elapsed : %lu\n", time); #endif return time; } } // namespace clMath // strmv performance test TEST_P(TRMV, strmv) { TestParams params; getParams(¶ms); TrmvPerformanceTest::runInstance(FN_STRMV, ¶ms); } // dtrmv performance test case TEST_P(TRMV, dtrmv) { TestParams params; getParams(¶ms); TrmvPerformanceTest::runInstance(FN_DTRMV, ¶ms); } // ctrmv performance test case TEST_P(TRMV, ctrmv) { TestParams params; getParams(¶ms); TrmvPerformanceTest::runInstance(FN_CTRMV, ¶ms); } // ztrmv performance test case TEST_P(TRMV, ztrmv) { TestParams params; getParams(¶ms); TrmvPerformanceTest::runInstance(FN_ZTRMV, ¶ms); } clblas-2.10/src/tests/performance/perf-trsm.cpp000066400000000000000000000033221264277366700215750ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ #include // srand() #include // memcpy() #include #include #include #include #include #include #include "TrxmPerformanceTest.cpp" using namespace std; using namespace clMath; // strsm performance test case TEST_P(TRSM, strsm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_STRSM, ¶ms); } // dtrsm performance test case TEST_P(TRSM, dtrsm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_DTRSM, ¶ms); } // ctrsm performance test case TEST_P(TRSM, ctrsm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_CTRSM, ¶ms); } // ztrsm performance test case TEST_P(TRSM, ztrsm) { TestParams params; getParams(¶ms); TrxmPerformanceTest::runInstance(FN_ZTRSM, ¶ms); } clblas-2.10/src/tests/performance/perf-trsv.cpp000066400000000000000000000233611264277366700216130ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ************************************************************************/ /* * Gemv performance test cases */ #include // srand() #include // memcpy() #include #include #include #include #include #include #include #ifdef PERF_TEST_WITH_ACML #include #include #endif #include "PerformanceTest.h" /* * NOTE: operation factor means overall number * of multiply and add per each operation involving * 2 matrix elements */ using namespace std; using namespace clMath; #define CHECK_RESULT(ret) \ do { \ ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " \ "perform an OpenCL request!" << endl; \ EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << \ endl; \ } while (0) namespace clMath { template class TrsvPerformanceTest : public PerformanceTest { public: virtual ~TrsvPerformanceTest(); virtual int prepare(void); virtual nano_time_t etalonPerfSingle(void); virtual nano_time_t clblasPerfSingle(void); static void runInstance(BlasFunction fn, TestParams *params) { TrsvPerformanceTest perfCase(fn, params); int ret = 0; int opFactor; BlasBase *base; base = clMath::BlasBase::getInstance(); opFactor = 1; if ((fn == FN_DTRSV || fn == FN_ZTRSV) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; return; } if (!perfCase.areResourcesSufficient(params)) { std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" << std::endl; return; } else { ret = perfCase.run(opFactor); } ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or " "perform an OpenCL request!" 
<< endl; EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl; } private: TrsvPerformanceTest(BlasFunction fn, TestParams *params); bool areResourcesSufficient(TestParams *params); TestParams params_; // ElemType alpha_; ElemType *A_; ElemType *X_; ElemType *backX_; cl_mem mobjA_; cl_mem mobjX_; ::clMath::BlasBase *base_; }; template TrsvPerformanceTest::TrsvPerformanceTest( BlasFunction fn, TestParams *params) : PerformanceTest( fn, (problem_size_t)( ( params->N * (params->N+1) * sizeof(ElemType) ) ) ), params_(*params), mobjA_(NULL), mobjX_(NULL) { A_ = new ElemType[(params_.N * params_.lda) + params_.offa]; X_ = new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ]; backX_ = new ElemType[ 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX ]; base_ = ::clMath::BlasBase::getInstance(); } template TrsvPerformanceTest::~TrsvPerformanceTest() { if(A_ != NULL) { delete[] A_; } if(X_ != NULL) { delete[] X_; } if(backX_ != NULL) { delete[] backX_; } if( mobjA_ != NULL ) clReleaseMemObject(mobjA_); if( mobjX_ != NULL ) clReleaseMemObject(mobjX_); } /* * Check if available OpenCL resources are sufficient to * run the test case */ template bool TrsvPerformanceTest::areResourcesSufficient(TestParams *params) { clMath::BlasBase *base; size_t gmemSize, allocSize; size_t n = params->N; if ((A_ == NULL) || (X_ == NULL) || (backX_ == NULL)) { return 0; } base = clMath::BlasBase::getInstance(); gmemSize = (size_t)base->availGlobalMemSize( 0 ); allocSize = (size_t)base->maxMemAllocSize(); bool suff = ( sizeof(ElemType)*n*params->lda < allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations suff = suff && ((( n*params->lda + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations return suff ; } template int TrsvPerformanceTest::prepare(void) { size_t lenX, N; N = params_.N; lenX = 1 + ((N-1) *abs(params_.incx)) + params_.offBX; randomTrsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, (A_ + params_.offa), params_.lda, (X_ + params_.offBX), params_.incx); memcpy(backX_, X_, lenX * sizeof(ElemType)); mobjA_ = base_->createEnqueueBuffer(A_, ((params_.N * params_.lda) + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_ONLY); mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(*X_), 0, CL_MEM_READ_WRITE); return ((mobjA_ != NULL) && (mobjX_ != NULL) ) ? 0 : -1; } template nano_time_t TrsvPerformanceTest::etalonPerfSingle(void) { nano_time_t time = 0; clblasOrder order; clblasUplo fUplo; clblasTranspose fTrans; size_t lda; #ifndef PERF_TEST_WITH_ROW_MAJOR if (params_.order == clblasRowMajor) { cerr << "Row major order is not allowed" << endl; return NANOTIME_ERR; } #endif memcpy(X_, backX_, ((1 + ((params_.N-1) * abs(params_.incx)))+params_.offBX) * sizeof(ElemType)); order = params_.order; fUplo = params_.uplo; fTrans = params_.transA; lda = params_.lda; #ifdef PERF_TEST_WITH_ACML if (order != clblasColumnMajor) { order = clblasColumnMajor; fUplo = (params_.uplo == clblasUpper)? clblasLower : clblasUpper; fTrans = (params_.transA == clblasNoTrans)? 
clblasTrans : clblasNoTrans; if( params_.transA == clblasConjTrans ) doConjugate( A_ + params_.offa, params_.N, params_.N, lda ); } time = getCurrentTime(); clMath::blas::trsv(order, fUplo, fTrans, params_.diag, params_.N, A_, params_.offa, lda, X_, params_.offBX, params_.incx); time = getCurrentTime() - time; #endif // PERF_TEST_WITH_ACML return time; } template nano_time_t TrsvPerformanceTest::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; size_t lenX = 1 + ((params_.N-1) * abs(params_.incx)) + params_.offBX; status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0, lenX * sizeof(ElemType), backX_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Vector X buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; DataType type; type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE: ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE; time = getCurrentTime(); #define TIMING #ifdef TIMING clFinish( queue); int iter = 20; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::trsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, mobjA_, params_.offa, params_.lda, mobjX_, params_.offBX, params_.incx, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS TRSV function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; } } // namespace clMath TEST_P(TRSV, strsv) { TestParams params; getParams(¶ms); TrsvPerformanceTest::runInstance(FN_STRSV, ¶ms); } TEST_P(TRSV, dtrsv) { TestParams params; getParams(¶ms); TrsvPerformanceTest::runInstance(FN_DTRSV, ¶ms); } TEST_P(TRSV, ctrsv) { TestParams params; getParams(¶ms); TrsvPerformanceTest::runInstance(FN_CTRSV, ¶ms); } TEST_P(TRSV, ztrsv) { TestParams params; getParams(¶ms); TrsvPerformanceTest::runInstance(FN_ZTRSV, ¶ms); } clblas-2.10/src/tests/performance/test-performance.cpp000066400000000000000000001242511264277366700231410ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #define DO_SYR #define DO_SPR #define DO_SYMM #define DO_TRMV #define DO_TPMV #define DO_TRSV #define DO_GEMM #define DO_TRMM #define DO_TRSM #define DO_GEMV #define DO_SYR2K #define DO_SYRK #define DO_GER #define DO_GERC #define DO_HER #define DO_HPR #define DO_SYR2 #define DO_SPR2 #define DO_SPR2 #define DO_SBMV #define DO_HER2 #define DO_HPR2 #define DO_HEMV #define DO_HEMM #define DO_HERK #define DO_SYMV #define DO_TPSV #define DO_HPMV #define DO_SPMV #define DO_GBMV #define DO_HBMV #define DO_TBMV #define DO_TBSV #define DO_HER2K #define DO_SWAP #define DO_COPY #define DO_SCAL #define DO_AXPY #define DO_DOT #define DO_DOTC #define DO_ROTG #define DO_ROTM #define DO_ROT #define DO_ROTMG #define DO_NRM2 #define DO_ASUM #define DO_iAMAX //#define DO_GEMM_2 - This needs to remain commented. #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "PerformanceRecorder.h" #define EXPECTED_SINGLE_FLOAT_PERF_RATIO 10.0 #define EXPECTED_DOUBLE_FLOAT_PERF_RATIO 4.0 using ::testing::TestWithParam; using ::testing::Values; using ::testing::ValuesIn; using ::testing::Combine; using namespace std; using namespace clMath; PerformanceRecorder *perfRecorder; static bool isDoubleZero(double d) { return (fabs(d) < 0.000001); } static const char *functionToString(BlasFunction function) { const char *s = NULL; switch (function) { case FN_SGEMV: s = "SGEMV"; break; case FN_DGEMV: s = "DGEMV"; break; case FN_CGEMV: s = "CGEMV"; break; case FN_ZGEMV: s = "ZGEMV"; break; case FN_SSYMV: s = "SSYMV"; break; case FN_DSYMV: s = "DSYMV"; break; case FN_SGEMM: s = "SGEMM"; break; case FN_DGEMM: s = "DGEMM"; break; case FN_CGEMM: s = "CGEMM"; break; case FN_ZGEMM: s = "ZGEMM"; break; case FN_SGEMM_2: s = "SGEMM_2"; break; case FN_DGEMM_2: s = "DGEMM_2"; break; case FN_CGEMM_2: s = "CGEMM_2"; break; case FN_ZGEMM_2: s = "ZGEMM_2"; break; case FN_STRMM: s = "STRMM"; break; case FN_DTRMM: s = "DTRMM"; break; case FN_CTRMM: s = "CTRMM"; break; case FN_ZTRMM: s = "ZTRMM"; break; case FN_STRSM: s = "STRSM"; break; case FN_DTRSM: s = "DTRSM"; break; case FN_CTRSM: s = "CTRSM"; break; case FN_ZTRSM: s = "ZTRSM"; break; case FN_SSYR2K: s = "SSYR2K"; break; case FN_DSYR2K: s = "DSYR2K"; break; case FN_CSYR2K: s = "CSYR2K"; break; case FN_ZSYR2K: s = "ZSYR2K"; break; case FN_SSYRK: s = "SSYRK"; break; case FN_DSYRK: s = "DSYRK"; break; case FN_CSYRK: s = "CSYRK"; break; case FN_ZSYRK: s = "ZSYRK"; break; case FN_STRMV: s = "STRMV"; break; case FN_DTRMV: s = "DTRMV"; break; case FN_CTRMV: s = "CTRMV"; break; case FN_ZTRMV: s = "ZTRMV"; break; case FN_STPMV: s = "STPMV"; break; case FN_DTPMV: s = "DTPMV"; break; case FN_CTPMV: s = "CTPMV"; break; case FN_ZTPMV: s = "ZTPMV"; break; case FN_STRSV: s = "STRSV"; break; case FN_DTRSV: s = "DTRSV"; break; case FN_CTRSV: s = "CTRSV"; break; case FN_ZTRSV: s = "ZTRSV"; break; case FN_STBSV: s = "STBSV"; break; case FN_DTBSV: s = "DTBSV"; break; case FN_CTBSV: s = "CTBSV"; break; case FN_ZTBSV: s = "ZTBSV"; break; case FN_STPSV: s = "STPSV"; break; case FN_DTPSV: s = "DTPSV"; break; case 
FN_CTPSV: s = "CTPSV"; break; case FN_ZTPSV: s = "ZTPSV"; break; case FN_SSYMM: s = "SSYMM"; break; case FN_DSYMM: s = "DSYMM"; break; case FN_CSYMM: s = "CSYMM"; break; case FN_ZSYMM: s = "ZSYMM"; break; case FN_SGER: s = "SGER"; break; case FN_DGER: s = "DGER"; break; case FN_CGERU: s = "CGERU"; break; case FN_ZGERU: s = "ZGERU"; break; case FN_CGERC: s = "CGERC"; break; case FN_ZGERC: s = "ZGERC"; break; case FN_CHER: s = "CHER"; break; case FN_ZHER: s = "ZHER"; break; case FN_CHPR: s = "CHPR"; break; case FN_ZHPR: s = "ZHPR"; break; case FN_CHER2: s = "CHER2"; break; case FN_ZHER2: s = "ZHER2"; break; case FN_SSYR: s = "SSYR"; break; case FN_DSYR: s = "DSYR"; break; case FN_SSPR2: s = "SSPR2"; break; case FN_DSPR2: s = "DSPR2"; break; case FN_SSPR: s = "SSPR"; break; case FN_DSPR: s = "DSPR"; break; case FN_SSYR2: s = "SSYR2"; break; case FN_DSYR2: s = "DSYR2"; break; case FN_CHEMM: s = "CHEMM"; break; case FN_ZHEMM: s = "ZHEMM"; break; case FN_CHEMV: s = "CHEMV"; break; case FN_ZHEMV: s = "ZHEMV"; break; case FN_CHERK: s = "CHERK"; break; case FN_ZHERK: s = "ZHERK"; break; case FN_SSBMV: s = "SSBMV"; break; case FN_DSBMV: s = "DSBMV"; break; case FN_CHBMV: s = "CHBMV"; break; case FN_ZHBMV: s = "ZHBMV"; break; case FN_CHER2K: s = "CHER2K"; break; case FN_ZHER2K: s = "ZHER2K"; break; case FN_SSWAP: s = "SSWAP"; break; case FN_DSWAP: s = "DSWAP"; break; case FN_CSWAP: s = "CSWAP"; break; case FN_ZSWAP: s = "ZSWAP"; break; case FN_SSCAL: s = "SSCAL"; break; case FN_DSCAL: s = "DSCAL"; break; case FN_CSCAL: s = "CSCAL"; break; case FN_ZSCAL: s = "ZSCAL"; break; case FN_CSSCAL: s = "CSSCAL"; break; case FN_ZDSCAL: s = "ZDSCAL"; break; case FN_SCOPY: s = "SCOPY"; break; case FN_DCOPY: s = "DCOPY"; break; case FN_CCOPY: s = "CCOPY"; break; case FN_ZCOPY: s = "ZCOPY"; break; case FN_SDOT: s = "SDOT"; break; case FN_DDOT: s = "DDOT"; break; case FN_CDOTU: s = "CDOTU"; break; case FN_ZDOTU: s = "ZDOTU"; break; case FN_CDOTC: s = "CDOTC"; break; case FN_ZDOTC: s = "ZDOTC"; break; case FN_SAXPY: s = "SAXPY"; break; case FN_DAXPY: s = "DAXPY"; break; case FN_CAXPY: s = "CAXPY"; break; case FN_ZAXPY: s = "ZAXPY"; break; case FN_SROTG: s = "SROTG"; break; case FN_DROTG: s = "DROTG"; break; case FN_CROTG: s = "CROTG"; break; case FN_ZROTG: s = "ZROTG"; break; case FN_SROTM: s = "SROTM"; break; case FN_DROTM: s = "DROTM"; break; case FN_SROT: s = "SROT"; break; case FN_DROT: s = "DROT"; break; case FN_CSROT: s = "CSROT"; break; case FN_ZDROT: s = "ZDROT"; break; case FN_SROTMG: s = "SROTMG"; break; case FN_DROTMG: s = "DROTMG"; break; case FN_SNRM2: s = "SNRM2"; break; case FN_DNRM2: s = "DNRM2"; break; case FN_SCNRM2: s = "SCNRM2"; break; case FN_DZNRM2: s = "DZNRM2"; break; case FN_SASUM: s = "SASUM"; break; case FN_DASUM: s = "DASUM"; break; case FN_SCASUM: s = "SCASUM"; break; case FN_DZASUM: s = "DZASUM"; break; case FN_iSAMAX: s = "iSAMAX"; break; case FN_iDAMAX: s = "iDAMAX"; break; case FN_iCAMAX: s = "iCAMAX"; break; case FN_iZAMAX: s = "iZAMAX"; break; default: break; } return s; } static const clblasOrder orderSet[] = #ifdef PERF_TEST_WITH_ROW_MAJOR { clblasColumnMajor, clblasRowMajor }; #else { clblasColumnMajor }; #endif static const clblasTranspose transSet[] = { clblasNoTrans, clblasTrans, clblasConjTrans }; static const clblasSide sideSet[] = { clblasLeft, clblasRight }; static const clblasUplo uploSet[] = { clblasUpper, clblasLower }; static const clblasDiag diagSet[] = { clblasUnit, clblasNonUnit }; const int sizeRange[] = { 2048, 2800, 4096, 5600 }; const int sizeRange48[] = { 
41*48, 41*48+24 }; // Since blas-1 contains only vector arrays, huge vectors has to be provided to reach the peak of the card const int blas1sizeRange[] = {4194304, 7840000, 16777216, 31360000 }; //const int sizeRange[] = { 2800, 4096, 5600}; const int KRange[] = { 2047, 2799, 4095, 5599 }; const int ldaRange[] = { 0, 5496, 5497 }; const int offsetRange[] = { 0, 100 }; const size_t offs[] = {0, 63, 128, 258 }; const int incRange[] = { 1, 10 }; const double realAlphaRange[] = {(double)50.0, (double)100.0, (double)999999}; const cl_float2 complexAlphaRange[] = {floatComplex(1,2), floatComplex(4,5)}; const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}}; #ifdef DO_GEMV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, GEMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, GEMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(32), Values(32), Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif #ifdef DO_SYMV // generic symv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, SYMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif #ifdef DO_GEMM_2 // generic gemm test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, gemm2, Combine( Values(clblasColumnMajor), Values(clblasNoTrans), Values(clblasNoTrans), ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_GEMM // generic gemm test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, GEMM, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet), ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, GEMM, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(transSet), Values(41*48), Values(41*48), Values(41*48), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_TRMM // generic trmm test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, TRMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(32), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_TRSM // generic trsm test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, TRSM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRSM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32), Values(32), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_SYR2K // generic 
syr2k test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, SYR2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYR2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(32), Values(32), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_SYRK // generic syrk test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, SYRK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(sizeRange), ValuesIn(sizeRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SYRK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), Values(32), Values(32), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_HERK // generic syrk test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), ValuesIn(sizeRange), ValuesIn(sizeRange),ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, HERK, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(32), Values(32), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_TRMV // generic trmv test looking over a set of sizes // N, LDA, INCX, OFFA, OFFX, NUMQUEUES INSTANTIATE_TEST_CASE_P(Generic, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(5000), Values(0), Values(1), Values(0,10), Values(0,9),Values(1))); #endif #ifdef DO_TPMV // generic trmv test looking over a set of sizes // N, LDA, INCX, OFFA, OFFX, NUMQUEUES INSTANTIATE_TEST_CASE_P(Generic, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), Values(0),Values(1), Values(0,10), Values(0,9), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(5000), Values(0),Values(1), Values(0,10), Values(0,9),Values(1))); #endif #ifdef DO_TRSV INSTANTIATE_TEST_CASE_P(Generic, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TRSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(1024), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_TPSV INSTANTIATE_TEST_CASE_P(Generic, TPSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TPSV, Combine( ValuesIn(orderSet), 
ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(1024), Values(0), Values(1), Values(0,10), Values(0,9), Values(1))); #endif #ifdef DO_SYMM INSTANTIATE_TEST_CASE_P(Generic, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(custom, SYMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), Values(1024), Values(1024), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes(0, 0, 0, 0, 0, 0)), Values(1) ) ); #endif #ifdef DO_HEMM INSTANTIATE_TEST_CASE_P(Generic, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes()), //ValuesIn(complexAlphaRange), Values(clMath::ExtraTestSizes((size_t)0, (size_t)0, (size_t)0, (size_t)12, (size_t)0, (size_t)1)), Values(1) ) ); INSTANTIATE_TEST_CASE_P(custom, HEMM, Combine( ValuesIn(orderSet), ValuesIn(sideSet), ValuesIn(uploSet), Values(1024), Values(1024), Values(complexAlphaRange[0]), Values(complexAlphaRange[1]), Values(clMath::ExtraTestSizes((size_t)0, (size_t)0, (size_t)0, (size_t)8, (size_t)0, (size_t)1 )), Values(1) ) ); #endif #ifdef DO_GER INSTANTIATE_TEST_CASE_P(Generic, GER, Combine( ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange), Values(0), Values(1), Values(1), Values(0, 10), Values(0, 8),Values(0, 9),Values(1) ) ); INSTANTIATE_TEST_CASE_P(custom, GER, Combine( ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange), Values(0), Values(1), Values(1), Values(0, 10), Values(0, 8),Values(0, 9), Values(1) ) ); #endif #ifdef DO_GERC INSTANTIATE_TEST_CASE_P(Generic, GERC, Combine( ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange), Values(0), Values(1), Values(1), Values(0, 10), Values(0, 8),Values(0, 9),Values(1) ) ); INSTANTIATE_TEST_CASE_P(custom, GERC, Combine( ValuesIn(orderSet),ValuesIn(sizeRange), ValuesIn(sizeRange), Values(0), Values(1), Values(1), Values(0, 10), Values(0, 8),Values(0, 9), Values(1) ) ); #endif #ifdef DO_HER INSTANTIATE_TEST_CASE_P(Generic, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, HER, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(ldaRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_HPR INSTANTIATE_TEST_CASE_P(Generic, HPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), Values(0), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, HPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), Values(0), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_HER2 INSTANTIATE_TEST_CASE_P(Generic, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, HER2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), 
ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif #ifdef DO_HPR2 INSTANTIATE_TEST_CASE_P(Generic, HPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange),ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, HPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(complexAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif #ifdef DO_SYR INSTANTIATE_TEST_CASE_P(Generic, SYR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, SYR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif #ifdef DO_SPR INSTANTIATE_TEST_CASE_P(Generic, SPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, SPR, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), Values(0), Values(1) ) ); #endif #ifdef DO_SYR2 INSTANTIATE_TEST_CASE_P(Generic, SYR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, SYR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(ldaRange), Values(1) ) ); #endif #ifdef DO_SPR2 INSTANTIATE_TEST_CASE_P(Generic, SPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(realAlphaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(0), Values(1) ) ); INSTANTIATE_TEST_CASE_P(Custom, SPR2, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(realAlphaRange), ValuesIn(offsetRange), Values(1), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(0), Values(1) ) ); #endif #ifdef DO_HEMV INSTANTIATE_TEST_CASE_P(Generic, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Custom, HEMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif #ifdef DO_HPMV INSTANTIATE_TEST_CASE_P(Generic, HPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Custom, HPMV, Combine( 
ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif #ifdef DO_SPMV INSTANTIATE_TEST_CASE_P(Generic, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), Values((size_t)0), Values((size_t)0), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); INSTANTIATE_TEST_CASE_P(Custom, SPMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(4099), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values((size_t)0), ValuesIn(offs), ValuesIn(offs), Values(clMath::ExtraTestSizes(0, 1, 1, 0, 0, 0)), Values(1))); #endif #ifdef DO_GBMV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), ValuesIn(sizeRange), ValuesIn(sizeRange), ValuesIn(KRange), ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, GBMV, Combine( ValuesIn(orderSet), ValuesIn(transSet), Values(32), Values(32), Values(30), Values(25), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_SBMV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, SBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(25), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_HBMV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, HBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(sizeRange), ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, HBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(32), Values(25), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_TBMV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange),ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, TBMV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32),Values(30),Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); #endif #ifdef DO_TBSV // generic gemv test looking over a set of sizes INSTANTIATE_TEST_CASE_P(Generic, TBSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), ValuesIn(sizeRange),ValuesIn(KRange), Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); // Custom test - use command line arguments 
to tweak it INSTANTIATE_TEST_CASE_P(Custom, TBSV, Combine( ValuesIn(orderSet), ValuesIn(uploSet), ValuesIn(transSet), ValuesIn(diagSet), Values(32),Values(30),Values(ExtraTestSizes(0, (int)1, (int)1, 0, 0, 0)), Values(1))); #endif #ifdef DO_HER2K INSTANTIATE_TEST_CASE_P(Generic, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), ValuesIn(sizeRange), ValuesIn(sizeRange),ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(ExtraTestSizes()), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, HER2K, Combine( ValuesIn(orderSet), ValuesIn(uploSet), Values(clblasNoTrans, clblasConjTrans), Values(32), Values(32), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(ExtraTestSizes()), Values(1))); #endif #ifdef DO_SWAP INSTANTIATE_TEST_CASE_P(Generic, SWAPXY, Combine( ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SWAPXY, Combine( Values(819430), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) ); #endif #ifdef DO_DOT INSTANTIATE_TEST_CASE_P(Generic, DOT, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, DOT, Combine( Values(819430), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_DOTC INSTANTIATE_TEST_CASE_P(Generic, DOTC, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, DOTC, Combine( Values(819430), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_COPY INSTANTIATE_TEST_CASE_P(Generic, COPY, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, COPY, Combine( Values(32), ValuesIn(incRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_SCAL INSTANTIATE_TEST_CASE_P(Generic, SCAL, Combine( ValuesIn(blas1sizeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, SCAL, Combine( Values(819430), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), Values(1, 2), Values(1) ) ); #endif #ifdef DO_AXPY INSTANTIATE_TEST_CASE_P(Generic, AXPY, Combine( ValuesIn(blas1sizeRange), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, AXPY, Combine( Values(819430), ValuesIn(alphaBetaRange), ValuesIn(offsetRange), Values(1, 2), ValuesIn(offsetRange), Values(1, 2), Values(1) ) ); #endif #ifdef DO_ROTG INSTANTIATE_TEST_CASE_P(Generic, ROTG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), 
ValuesIn(offsetRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, ROTG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1))); #endif #ifdef DO_ROTM INSTANTIATE_TEST_CASE_P(Generic, ROTM, Combine( ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, ROTM, Combine( ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_ROT INSTANTIATE_TEST_CASE_P(Generic, ROT, Combine( ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, ROT, Combine( ValuesIn(blas1sizeRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(incRange), ValuesIn(alphaBetaRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_ROTMG INSTANTIATE_TEST_CASE_P(Generic, ROTMG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1))); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, ROTMG, Combine( ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(offsetRange), ValuesIn(alphaBetaRange), Values(1))); #endif #ifdef DO_NRM2 INSTANTIATE_TEST_CASE_P(Generic, NRM2, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, NRM2, Combine( Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_ASUM INSTANTIATE_TEST_CASE_P(Generic, ASUM, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, ASUM, Combine( Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #ifdef DO_iAMAX INSTANTIATE_TEST_CASE_P(Generic, iAMAX, Combine( ValuesIn(blas1sizeRange), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); // Custom test - use command line arguments to tweak it INSTANTIATE_TEST_CASE_P(Custom, iAMAX, Combine( Values(819430), ValuesIn(incRange), ValuesIn(offsetRange), ValuesIn(offsetRange), Values(1) ) ); #endif #if 0 // ensure that a TRMM function is faster then the respective GEMM one static void checkIsTrmmFaster(BlasFunction trmmFn, BlasFunction gemmFn) { const char *s1, *s2; gflops_t gf1, gf2; gf1 = perfRecorder->clblasAvgPerf(trmmFn); gf2 = perfRecorder->clblasAvgPerf(gemmFn); if (isDoubleZero((double)gf1) || isDoubleZero((double)gf2)) { // skip, respective tests has not been run return; } s1 = functionToString(trmmFn); s2 = functionToString(gemmFn); cerr << "Check if the " << s1 << " function is faster than the " << s2 << " one" << endl; if (gf1 * 2 > gf2) { // since TRMM has in twice as less operations as GEMM cerr << "PASS" << endl 
<< endl; } else { cerr << "FAIL" << endl << endl; } } #endif int main(int argc, char *argv[]) { int ret; int fn; gflops_t gflops1, gflops2; gbps_t gbps1,gbps2; double ratio; const char *name; ::clMath::BlasBase *base; TestParams params; #if 0 BlasFunction estimFuncs[][2] = { {FN_SGEMM, FN_CGEMM }, // FN_STRMM, FN_CTRMM}, {FN_DGEMM, FN_ZGEMM } // FN_DTRMM, FN_ZTRMM}}; }; const char *message[2] = { "Check if the resulting average ratio for single float types " "(for GEMM and TRMM) matches the expected one ", "Check if the resulting average ratio for double float " "precision types (for GEMM and TRMM) matches the expected one "}; double estimRatios[2] = { EXPECTED_SINGLE_FLOAT_PERF_RATIO, EXPECTED_DOUBLE_FLOAT_PERF_RATIO}; #endif if ((argc > 1) && !strcmp(argv[1], "--test-help")) { printUsage("test-performance"); return 0; } ::testing::InitGoogleTest(&argc, argv); ::std::cerr << "Initialize OpenCL and CLBLAS..." << ::std::endl; base = ::clMath::BlasBase::getInstance(); if (base == NULL) { ::std::cerr << "Fatal error, OpenCL or clblas initialization failed! " "Leaving the test." << ::std::endl; return -1; } base->setSeed(DEFAULT_SEED); if (argc > 1) { params.optFlags = NO_FLAGS; params.devType = CL_DEVICE_TYPE_GPU; params.devName = NULL; if (parseBlasCmdLineArgs(argc, argv, ¶ms) != 0) { printUsage(argv[0]); return 1; } if (params.optFlags & SET_SEED) { base->setSeed(params.seed); } if (params.optFlags & SET_ALPHA) { base->setAlpha(params.alpha); } if (params.optFlags & SET_BETA) { base->setBeta(params.beta); } if (params.optFlags & SET_M) { base->setM(params.M); } if (params.optFlags & SET_N) { base->setN(params.N); } if (params.optFlags & SET_K) { base->setK(params.K); } if (params.optFlags & SET_INCX) { base->setIncX(params.incx); } if (params.optFlags & SET_DEVICE_TYPE) { if (!base->setDeviceType(¶ms.devType, params.devName)) { ::std::cerr << "Fatal error, OpenCL or clblas " "initialization failed! Leaving the test." << ::std::endl; return -1; } } if (params.optFlags & SET_NUM_COMMAND_QUEUES) { base->setNumCommandQueues(params.numCommandQueues); } } parseEnv(¶ms); if (params.optFlags & SET_USE_IMAGES) { base->setUseImages(params.useImages); } perfRecorder = new PerformanceRecorder; /* Use of image based buffers is deprecated if (base->useImages()) { if (base->addScratchImages()) { std::cerr << "FATAL ERROR, CANNOT CREATE SCRATCH IMAGES!" 
<< std::endl; } } */ ret = RUN_ALL_TESTS(); if (base->useImages()) { base->removeScratchImages(); } cerr << endl << endl; cerr << "----------------------------------------------" << endl << "Overall performance information:" << endl << "----------------------------------------------" << endl; // now, check average speed ratio for (fn = 0; fn < BLAS_FUNCTION_END; fn++) { name = functionToString(static_cast(fn)); /* * For global memory based solutions print only average performance, * and for those of image based perform just comparison */ ratio = perfRecorder->avgTimeRatio(static_cast(fn)); if (isDoubleZero(ratio)) { // skip, this group of tests has not been run continue; } if (functionBlasLevel(static_cast(fn)) != 3) //display metrics in GBps if it is a BLAS-1 or BLAS-2 function { gbps1 = perfRecorder->etalonAvgGbpsPerf( static_cast(fn)); gbps2 = perfRecorder->clblasAvgGbpsPerf( static_cast(fn)); cout << "Average reference " << name << endl << " performance is " << gbps1 << " GBps; for CLBLAS implementation: " << endl << "average performance = " << gbps2 << " GBps, " "average time ratio = " << ratio << endl << endl; } else //display metrics in GFlops if its a BLAS-3 function { gflops1 = perfRecorder->etalonAvgPerf( static_cast(fn)); gflops2 = perfRecorder->clblasAvgPerf( static_cast(fn)); cout << "Average reference " << name << endl << " performance is " << gflops1 << " giga-flops; for CLBLAS implementation: " << endl << "average performance = " << gflops2 << " giga-flops, " "average time ratio = " << ratio << endl << endl; } } // check if TRMM is faster than GEMM #if 0 checkIsTrmmFaster(FN_STRMM, FN_SGEMM); checkIsTrmmFaster(FN_DTRMM, FN_DGEMM); checkIsTrmmFaster(FN_CTRMM, FN_CGEMM); checkIsTrmmFaster(FN_ZTRMM, FN_ZGEMM); /* * Now, do the final average ratio comparison if there is * the image based version. Involve only GEMM and TRMM as using * 2 images */ if (base->useImages()) { int j; for (i = 0; i < 2; i++) { ratio = 0; nruns = 0; for (j = 0; j < 2; j++) { r = perfRecorder->avgTimeRatio(estimFuncs[i][j]); if (!isDoubleZero(r)) { ratio += r; nruns++; } } if (nruns) { ratio /= nruns; cerr << message[i] << endl; if (ratio >= estimRatios[i]) { cerr << "PASS (" << ratio << ")" << endl << endl; } else { cerr << "FAIL (" << ratio << ")" << endl << endl; } } } } #endif /* * Explicitely tell the singleton to release all resources, * before we return from main. */ base->release( ); return ret; } clblas-2.10/src/tests/timer.c000066400000000000000000000060071264277366700161400ustar00rootroot00000000000000/* ************************************************************************ * Copyright 2013 Advanced Micro Devices, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* ************************************************************************/ #include "timer.h" #if defined(_MSC_VER) #include nano_time_t conv2nanosec(nano_time_t t) { LARGE_INTEGER count; if (QueryPerformanceFrequency(&count) == FALSE) { return 0; } t = (t * 1000000)/count.QuadPart; return (nano_time_t)(t * 1000); } nano_time_t conv2microsec(nano_time_t t) { LARGE_INTEGER count; if (QueryPerformanceFrequency(&count) == FALSE) { return 0; } return (t * 1000000ULL)/count.QuadPart; } nano_time_t conv2millisec(nano_time_t t) { LARGE_INTEGER count; if (QueryPerformanceFrequency(&count) == FALSE) { return 0; } return (t * 1000) / count.QuadPart; } nano_time_t getCurrentTime(void) { LARGE_INTEGER count; if (QueryPerformanceCounter(&count) == FALSE) { return 0; } return (nano_time_t)count.QuadPart; } void sleepTime(nano_time_t time) { DWORD tms = (DWORD)(time/1000000); Sleep(tms); } #else /* defined(_MCS_VER) */ #include #if defined(__APPLE__) && defined(__MACH__) #include #include #include #include // see https://developer.apple.com/library/mac/qa/qa1398/_index.html static mach_timebase_info_data_t mtb_; static void init_timebase_conv_(void) { kern_return_t err; err = mach_timebase_info(&mtb_); assert(err == KERN_SUCCESS); } nano_time_t getCurrentTime(void) { static pthread_once_t once = PTHREAD_ONCE_INIT; uint64_t now; pthread_once(&once, init_timebase_conv_); now = mach_absolute_time(); return (now * mtb_.numer) / mtb_.denom; } #else /* ! (_MCS_VER || __APPLE__) */ nano_time_t getCurrentTime(void) { int err; struct timespec t; err = clock_gettime(CLOCK_REALTIME, &t); if (err == 0) { return (t.tv_sec * 1000000000UL + t.tv_nsec); } return 0; } #endif nano_time_t conv2nanosec(nano_time_t t) { /* clock_... functions measure time in nanoseconds */ return t; } nano_time_t conv2microsec(nano_time_t t) { return t/1000; } nano_time_t conv2millisec(nano_time_t t) { return t/1000000; } void sleepTime(nano_time_t time) { struct timespec t1; t1.tv_sec = 0; t1.tv_nsec = time; nanosleep(&t1, NULL); } // namespace ) #endif /* defined(_MCS_VER) */ clblas-2.10/src/wrappers/000077500000000000000000000000001264277366700153525ustar00rootroot00000000000000clblas-2.10/src/wrappers/python/000077500000000000000000000000001264277366700166735ustar00rootroot00000000000000clblas-2.10/src/wrappers/python/README.txt000066400000000000000000000060741264277366700204000ustar00rootroot00000000000000pyclBLAS setup and installation (I've been pronouncing it 'pickleBLAS') ------------------------------------------------------------------------ A python extention wrapper around clBLAS from https://github.com/clMathLibraries/clBLAS Dependencies: 1. clBLAS from https://github.com/clMathLibraries/clBLAS ( develop branch ) 2. PyOpenCL from http://mathema.tician.de/software/pyopencl/ ( 2013.2 minimum ) 3. Cython from http://cython.org/, ( 0.18 minimum ) 4. OpenCL runtime, such as AMD's catalyst package ( AMD v2.9 SDK tested ) NOTE: This has been tested with 32-bit python on windows & 64-bit on OpenSUSE NOTE: Only sgemm has been wrapped as proof-of-concept Build steps: ------------------------------------------------------------------------ 1. First, clone the clBLAS repo from github and make sure to build the 'install' step. This is either 'make install' on linux derivatives or the 'install' project on Visual Studio projects. This should produce a 'package' directory in your build tree that contains ./include, ./libXX & ./bin. Note: it is necessary to build 32-bit clBLAS if using 32-bit python, and 64-bit clBLAS for 64-bit python. 2. 
Install pyopencl. If your python distribution contains a version of pyopencl that is a minimum of 2013.2, then just install with the distribution's package manager like pypm, pip, easy_install. If not, download pyopencl yourself and follow its directions to build and install. 3. Install Cython. If your python distribution contains a version of cython that is a minimum of .18, then just install with the distribution's package manager like pypm, pip, easy_install. If not, download cython yourself and follow its directions to build and install. 4. An OpenCL SDK is required to build, which includes OpenCL header files and linkable libraries. One such SDK is the AMD APP SDK, which can be downloaded from http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/ 5. Build the pyclBLAS extension. This is accomplished by running setup.py, which acts as a python makefile. An example install command: 'python setup.py --clBlasRoot=F:\code\GitHub\clMathLibraries\bin\clBLAS\develop\vs11x32\package build_ext --inplace' 'python setup.py --help' prints additional command line parameters that extend the traditional distutils options. After successfully building the extension module, a pyclBLAS.pyd file appears. As shown above, it may be necessary to provide the setup makefile with the paths of the clBLAS 'package' directory and the OpenCL SDK directory. Setup.py does attempt to find the OpenCL SDK through the environment variable AMDAPPSDKROOT or OPENCL_ROOT. NOTE: On windows, if using a more recent version of visual studio than 2008, it may be necessary to trick python into using the newer version of your compiler, by creating an environment variable that it expects to exist as such: set VS90COMNTOOLS=%VS110COMNTOOLS% NOTE: It may be necessary to copy the clBLAS shared library into the same directory as the extension module so that it can find clBLAS at runtime clblas-2.10/src/wrappers/python/pyclBLAS.pxd000066400000000000000000000065051264277366700210270ustar00rootroot00000000000000################################################################################ # Copyright 2014 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. ################################################################################ # This pxd file defines all the enums and structs that we plan to use from # python.
It is used from pyclBLAS.pyx from libc.stdint cimport intptr_t, uintptr_t cdef extern from "clBLAS.h": # These are base OpenCL enumerations that clBLAS uses cdef enum: CL_SUCCESS = 0 CL_INVALID_VALUE = -30 CL_INVALID_COMMAND_QUEUE = -36 CL_INVALID_CONTEXT = -34 CL_INVALID_MEM_OBJECT = -38 CL_INVALID_DEVICE = -33 CL_INVALID_EVENT_WAIT_LIST = -57 CL_OUT_OF_RESOURCES = -5 CL_OUT_OF_HOST_MEMORY = -6 CL_INVALID_OPERATION = -59 CL_COMPILER_NOT_AVAILABLE = -3 CL_BUILD_PROGRAM_FAILURE = -11 cdef enum clblasStatus_: clblasSuccess = CL_SUCCESS clblasInvalidValue = CL_INVALID_VALUE clblasInvalidCommandQueue = CL_INVALID_COMMAND_QUEUE clblasInvalidContext = CL_INVALID_CONTEXT clblasInvalidMemObject = CL_INVALID_MEM_OBJECT clblasInvalidDevice = CL_INVALID_DEVICE clblasInvalidEventWaitList = CL_INVALID_EVENT_WAIT_LIST clblasOutOfResources = CL_OUT_OF_RESOURCES clblasOutOfHostMemory = CL_OUT_OF_HOST_MEMORY clblasInvalidOperation = CL_INVALID_OPERATION clblasCompilerNotAvailable = CL_COMPILER_NOT_AVAILABLE clblasBuildProgramFailure = CL_BUILD_PROGRAM_FAILURE clblasNotImplemented = -1024 clblasNotInitialized = -1023 clblasInvalidMatA clblasInvalidMatB clblasInvalidMatC clblasInvalidVecX clblasInvalidVecY clblasInvalidDim clblasInvalidLeadDimA clblasInvalidLeadDimB clblasInvalidLeadDimC clblasInvalidIncX clblasInvalidIncY clblasInsufficientMemMatA clblasInsufficientMemMatB clblasInsufficientMemMatC clblasInsufficientMemVecX clblasInsufficientMemVecY ctypedef clblasStatus_ clblasStatus cdef enum clblasOrder_: clblasRowMajor = 0 clblasColumnMajor = 1 ctypedef clblasStatus_ clblasOrder cdef enum clblasTranspose_: clblasNoTrans = 0 clblasTrans = 1 clblasConjTrans = 2 ctypedef clblasStatus_ clblasTranspose ctypedef unsigned int cl_uint ctypedef float cl_float ctypedef void* cl_mem ctypedef void* cl_command_queue ctypedef void* cl_event clblas-2.10/src/wrappers/python/pyclBLAS.pyx000066400000000000000000000125121264277366700210470ustar00rootroot00000000000000################################################################################ # Copyright 2014 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ cimport pyclBLAS import pyopencl # These are prototypes from clBLAS.h that we wish to call from python ################################################################################ ################################################################################ cdef extern from "clBLAS.h": clblasStatus clblasGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch ) clblasStatus clblasSetup( ) void clblasTeardown( ) clblasStatus clblasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda, const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc, cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList, const cl_event* eventWaitList, cl_event* events) ################################################################################ ################################################################################ # enums to be accessed from python # TODO: is there a better way to express enums? I like how pyopencl does it, # they have layers of scoped constants cl.mem_flags.READ_ONLY # The enums below have global scope RowMajor = pyclBLAS.clblasRowMajor ColumnMajor = pyclBLAS.clblasColumnMajor NoTrans = pyclBLAS.clblasNoTrans Trans = pyclBLAS.clblasTrans ConjTrans = pyclBLAS.clblasConjTrans ################################################################################ ################################################################################ # The following functions are the python callable wrapper implementations def Setup( ): result = clblasSetup( ) if( result != clblasSuccess ): raise RuntimeError( "clblasSetup( ) failed initialization" ) return result ################################################################################ def Teardown( ): clblasTeardown( ) return ################################################################################ def GetVersion( ): cdef pyclBLAS.cl_uint pyMajor cdef pyclBLAS.cl_uint pyMinor cdef pyclBLAS.cl_uint pyPatch result = clblasGetVersion( &pyMajor, &pyMinor, &pyPatch ) if( result != clblasSuccess ): raise RuntimeError( "clblasGetVersion( ) did not return version information" ) return pyMajor, pyMinor, pyPatch ################################################################################ # TODO: Is there way to template these python callable functions, such that we # do not need to make a new function for every supported precision? 
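# The wrapper below hands a single-precision GEMM off to clblasSgemm. A minimal
# usage sketch follows (assumptions: queue, bufA, bufB, bufC, M, N, K are
# illustrative pyopencl objects/sizes created by the caller, not names defined
# in this module; column-major, non-transposed matrices):
#
#   import pyclBLAS
#   pyclBLAS.Setup()
#   # A is M x K (lda = M), B is K x N (ldb = K), C is M x N (ldc = M)
#   evt = pyclBLAS.Sgemm( pyclBLAS.ColumnMajor, pyclBLAS.NoTrans, pyclBLAS.NoTrans,
#                         M, N, K, 1.0,
#                         bufA, 0, M, bufB, 0, K, 0.0, bufC, 0, M,
#                         1, queue, 0, None )
#   evt.wait()    # Sgemm returns a pyopencl.Event wrapping the clBLAS event
#   pyclBLAS.Teardown()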
def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB, size_t M, size_t N, size_t K, cl_float alpha, A, size_t offA, size_t lda, B, size_t offB, size_t ldb, cl_float beta, C, size_t offC, size_t ldc, cl_uint numCommandQueues, commandQueues, cl_uint numEventsInWaitList, eventWaitList ): # Simplify python wrapper to only handle 1 queue at this time if( numCommandQueues != 1 ): raise IndexError( "pyblasSgemm( ) requires the number of queues to be 1" ) cdef intptr_t pIntQueue = commandQueues.int_ptr cdef cl_command_queue pcqQueue = pIntQueue # This logic does not yet work for numEventsInWaitList > (greater than) 1 # Need to figure out how python & pyopencl pass lists of objects cdef intptr_t pIntWaitList = 0 cdef cl_event* pWaitList = NULL if( numEventsInWaitList > 0 ): if( numEventsInWaitList < 2 ): pIntWaitList = eventWaitList.int_ptr pWaitList = pIntWaitList else: raise IndexError( "pyblasSgemm( ) requires numEventsInWaitList to be <= 1" ) # Pyopencl objects contain an int_ptr method to get access to the internally wrapped # OpenCL object pointers cdef cl_event outEvent = NULL cdef intptr_t matA = A.int_ptr cdef intptr_t matB = B.int_ptr cdef intptr_t matC = C.int_ptr # Transition execution to clBLAS cdef clblasStatus result = clblasSgemm( order, transA, transB, M, N, K, alpha, matA, offA, lda, matB, offB, ldb, beta, matC, offC, ldc, numCommandQueues, &pcqQueue, numEventsInWaitList, pWaitList, &outEvent ) if( result != clblasSuccess ): raise RuntimeError( "clBLAS sgemm call failed" ) # Create a pyopencl Event object from the event returned from clBLAS and return # it to the user sgemmEvent = pyopencl.Event.from_int_ptr( outEvent ) return sgemmEvent clblas-2.10/src/wrappers/python/setup.py000066400000000000000000000077631264277366700204220ustar00rootroot00000000000000################################################################################ # Copyright 2014 Advanced Micro Devices, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
################################################################################ from distutils.core import setup from distutils.extension import Extension from Cython.Distutils import build_ext from os import path, environ import argparse import platform def main(): parser = argparse.ArgumentParser(description='Set up the pyclBLAS extension module') parser.add_argument('--clRoot', dest='clRoot', default=None, help='Root directory to find the OpenCL SDK, which should contain the include directory') parser.add_argument('--clBlasRoot', dest='clBlasRoot', default=None, help='Root directory to find the clBLAS SDK, which should contain the include directory') args, unknown_args = parser.parse_known_args( ) ## print( "recognized args: ", args ) ## print( "unknown args: ", unknown_args ) # First check environment variables for clRoot paths clRootPath = None if( environ.get('OPENCL_ROOT') is not None ): clRootPath = environ['OPENCL_ROOT'] # Special check for environment variable set by AMD Catalyst installer if( clRootPath is None and environ.get( 'AMDAPPSDKROOT' ) is not None ): clRootPath = environ['AMDAPPSDKROOT'] # If user specifies a command line options, this trumps environment variables print( "args.clRoot: ", args.clRoot ) if( args.clRoot is not None ): clRootPath = args.clRoot if( clRootPath is None ): print( "This setup.py needs to know the root path of an OpenCL installation") print( "Please specify the environment variable OPENCL_ROOT with a path" ) print( "Or pass the command line option --clRoot" ) exit( ) # First check environment variables for clRoot paths clBlasRootPath = None if( environ.get('CLBLAS_ROOT') is not None ): clBlasRootPath = environ['CLBLAS_ROOT'] # If user specifies a command line options, this trumpts environment variables print( "args.clBlasRoot: ", args.clBlasRoot ) if( args.clBlasRoot is not None ): clBlasRootPath = args.clBlasRoot if( clBlasRootPath is None ): print( "This setup.py needs to know the root path of the clBLAS installation") print( "Please specify the environment variable CLBLAS_ROOT with a path" ) print( "or pass the command line option --clBlasRoot" ) exit( ) # 64bit and 32bit have different library paths if( platform.architecture( )[0] == '64bit' ): libraryPath = 'lib64' else: libraryPath = 'lib' # Windows and linux have different library paths if( platform.system( ) == 'Windows' ): libraryPath = path.join( libraryPath, 'import' ) module = [ Extension( name = 'pyclBLAS', sources = ['pyclBLAS.pyx'], include_dirs = [ path.join( clRootPath, 'include' ), path.join( clBlasRootPath, 'include' ) ], library_dirs = [ path.join( clBlasRootPath, libraryPath ) ], libraries=['clBLAS'] ) ] setup( name = 'pyclBLAS', version = '0.0.1', author = 'Kent Knox', description = 'Python wrapper for clBLAS', license = 'Apache License, Version 2.0', cmdclass = {"build_ext": build_ext}, ext_modules = module, script_args = unknown_args ) # This is the start of the execution of the python script # Useful for debuggers to step into script if __name__ == '__main__': main( )
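# Example invocation (a sketch only; the SDK and clBLAS paths below are
# placeholders the caller must substitute, they are not defined by this script):
#
#   python setup.py --clRoot=/opt/AMDAPPSDK --clBlasRoot=/path/to/clBLAS/package build_ext --inplace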