pax_global_header 0000666 0000000 0000000 00000000064 14343743117 0014521 g ustar 00root root 0000000 0000000 52 comment=5ad609661e570ba6aa7716a26a91cb67d559f8a2
kokkos-3.7.01/ 0000775 0000000 0000000 00000000000 14343743117 0013112 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/.clang-format 0000664 0000000 0000000 00000000310 14343743117 0015457 0 ustar 00root root 0000000 0000000 #Official Tool: clang-format version 8.0.0
BasedOnStyle: google
SortIncludes: false
AlignConsecutiveAssignments: true
AllowShortCaseLabelsOnASingleLine: true
AllowShortIfStatementsOnASingleLine: true
kokkos-3.7.01/.clang-format-ignore 0000664 0000000 0000000 00000000104 14343743117 0016741 0 ustar 00root root 0000000 0000000 core/unit_test/config/results/*
tpls/gtest/gtest/*
core/src/desul/*
kokkos-3.7.01/.clang-tidy 0000664 0000000 0000000 00000000227 14343743117 0015147 0 ustar 00root root 0000000 0000000 Checks: '-*,kokkos-*,modernize-use-using,modernize-use-nullptr,cppcoreguidelines-pro-type-cstyle-cast'
FormatStyle: file
HeaderFilterRegex: '.*/*.hpp'
kokkos-3.7.01/.codecov.yml 0000664 0000000 0000000 00000000312 14343743117 0015331 0 ustar 00root root 0000000 0000000 coverage:
precision: 1
round: down
range: "70...100"
ignore:
- tpls/
- algorithms/unit_tests
- core/perf_test/
- core/unit_test/
- containers/performance_tests
- containers/unit_tests
kokkos-3.7.01/.github/ 0000775 0000000 0000000 00000000000 14343743117 0014452 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/.github/ISSUE_TEMPLATE/ 0000775 0000000 0000000 00000000000 14343743117 0016635 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/.github/ISSUE_TEMPLATE/bug_report.md 0000664 0000000 0000000 00000001336 14343743117 0021332 0 ustar 00root root 0000000 0000000 ---
name: Bug report
about: Create a report to correct failures and improve our code
title: ''
labels: ''
assignees: ''
---
**Describe the bug**
Please provide a concise, clear description of the bug, as well as any available error logs.
**Please also include the following items to support reproducing the bug**
1. compilers (with versions)
2. Kokkos release or commit used (i.e., the sha1 number)
3. platform and backend
4. cmake configure command
5. output from cmake command
6. code needed to reproduce the bug
7. command line needed to reproduce the bug
8. please also attach the `KokkosCore_config.h` header file (generated during the build)
**Any additional info**
Please provide any additional context about the issue here.
kokkos-3.7.01/.github/workflows/ 0000775 0000000 0000000 00000000000 14343743117 0016507 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/.github/workflows/continuous-integration-workflow-hpx.yml 0000664 0000000 0000000 00000005172 14343743117 0026433 0 ustar 00root root 0000000 0000000 name: github-Linux-hpx
on: [push, pull_request]
concurrency:
group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{github.event_name == 'pull_request'}}
jobs:
hpx:
name: hpx
runs-on: [ubuntu-latest]
steps:
- name: checkout code
uses: actions/checkout@v2.2.0
with:
path: kokkos
- name: setup hpx dependencies
run: |
sudo apt update
sudo apt install \
clang \
hwloc \
libasio-dev \
libboost-all-dev \
ninja-build
- name: checkout hpx
uses: actions/checkout@v2.2.0
with:
repository: STELLAR-GROUP/hpx
ref: 1.7.1
path: hpx
- uses: actions/cache@v2
id: cache-hpx
with:
path: ./hpx/install
key: kokkos-hwloc-${{ github.ref }}-${{ github.sha }}
restore-keys: kokkos-hwloc-${{ github.ref }}
- name: configure hpx
if: steps.cache-hpx.outputs.cache-hit != 'true'
run: |
mkdir -p hpx/{build,install}
cd hpx/build
cmake \
-GNinja \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$PWD/../install \
-DCMAKE_CXX_COMPILER=clang++ \
-DHPX_WITH_UNITY_BUILD=ON \
-DHPX_WITH_MALLOC=system \
-DHPX_WITH_NETWORKING=OFF \
-DHPX_WITH_EXAMPLES=OFF \
-DHPX_WITH_TESTS=OFF \
..
- name: build and install hpx
if: steps.cache-hpx.outputs.cache-hit != 'true'
working-directory: hpx/build
run: ninja -j2 install
- name: configure kokkos
run: |
mkdir -p kokkos/{build,install}
cd kokkos/build
cmake \
-GNinja \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_INSTALL_PREFIX=$PWD/../install \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CXX_FLAGS="-Werror" \
-DHPX_ROOT=$PWD/../../hpx/install \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_EXAMPLES=ON \
-DKokkos_ENABLE_HPX=ON \
-DKokkos_ENABLE_HPX_ASYNC_DISPATCH=ON \
-DKokkos_ENABLE_SERIAL=OFF \
-DKokkos_ENABLE_TESTS=ON \
..
- name: build_and_install_kokkos
working-directory: kokkos/build
run: ninja -j2 install
- name: test_kokkos
working-directory: kokkos/build
run: ctest --timeout 2000 -j2 --output-on-failure
kokkos-3.7.01/.github/workflows/continuous-integration-workflow.yml 0000664 0000000 0000000 00000010614 14343743117 0025633 0 ustar 00root root 0000000 0000000 name: github-Linux
on: [push, pull_request]
concurrency:
group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{github.event_name == 'pull_request'}}
jobs:
CI:
continue-on-error: true
strategy:
matrix:
distro: ['fedora:latest', 'ubuntu:latest']
cxx: ['g++', 'clang++']
cmake_build_type: ['Release', 'Debug']
backend: ['OPENMP']
clang-tidy: ['']
include:
- distro: 'fedora:intel'
cxx: 'icpc'
cmake_build_type: 'Release'
backend: 'OPENMP'
clang-tidy: ''
- distro: 'fedora:intel'
cxx: 'icpc'
cmake_build_type: 'Debug'
backend: 'OPENMP'
clang-tidy: ''
- distro: 'fedora:intel'
cxx: 'icpx'
cmake_build_type: 'Release'
backend: 'OPENMP'
clang-tidy: ''
- distro: 'fedora:intel'
cxx: 'icpx'
cmake_build_type: 'Debug'
backend: 'OPENMP'
clang-tidy: ''
- distro: 'ubuntu:latest'
cxx: 'clang++'
cmake_build_type: 'RelWithDebInfo'
backend: 'THREADS'
clang-tidy: '-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*"'
- distro: 'ubuntu:latest'
cxx: 'g++'
cmake_build_type: 'RelWithDebInfo'
backend: 'THREADS'
runs-on: ubuntu-latest
container:
image: ghcr.io/kokkos/ci-containers/${{ matrix.distro }}
# see https://github.com/actions/virtual-environments/issues/3812
options: --security-opt seccomp=unconfined
steps:
- name: Checkout desul
uses: actions/checkout@v2.2.0
with:
repository: desul/desul
ref: 477da9c8f40f8db369c28dd3f93a67e376d8511b
path: desul
- name: Install desul
working-directory: desul
run: |
git submodule init
git submodule update
mkdir build
cd build
cmake -DDESUL_ENABLE_TESTS=OFF -DCMAKE_INSTALL_PREFIX=/usr/desul-install ..
sudo cmake --build . --target install --parallel 2
- name: Checkout code
uses: actions/checkout@v2.2.0
- uses: actions/cache@v2
with:
path: ~/.ccache
key: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{ github.ref }}-${{ github.sha }}
restore-keys: kokkos-${{ matrix.distro }}-${{ matrix.cxx }}-${{ matrix.cmake_build_type }}-${{ matrix.openmp }}-${{github.ref}}
- name: maybe_disable_death_tests
if: ${{ matrix.distro == 'fedora:rawhide' }}
run: echo "GTEST_FILTER=-*DeathTest*" >> $GITHUB_ENV
- name: maybe_use_external_gtest
if: ${{ matrix.distro == 'ubuntu:latest' }}
run: sudo apt-get update && sudo apt-get install -y libgtest-dev
- name: maybe_install_clang_tidy
if: ${{ matrix.clang-tidy != '' }}
run: sudo apt-get update && sudo apt-get install -y clang-tidy
- name: Configure Kokkos
run: |
cmake -B builddir \
-DCMAKE_INSTALL_PREFIX=/usr \
${{ matrix.clang-tidy }} \
-Ddesul_ROOT=/usr/desul-install/ \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \
-DKokkos_ENABLE_HWLOC=ON \
-DKokkos_ENABLE_${{ matrix.backend }}=ON \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_EXAMPLES=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DCMAKE_CXX_COMPILER=${{ matrix.cxx }} \
-DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
- name: Build
run: |
ccache -z
cmake --build builddir --parallel 2
ccache -s
- name: Tests
working-directory: builddir
run: ctest --output-on-failure
- name: Test DESTDIR Install
run: DESTDIR=${PWD}/install cmake --build builddir --target install && rm -rf ${PWD}/install/usr && rmdir ${PWD}/install
- name: Install
run: sudo cmake --build builddir --target install
- name: Test install
working-directory: example/build_cmake_installed
run: |
cmake -B builddir -DCMAKE_CXX_COMPILER=${{ matrix.cxx }}
cmake --build builddir
cmake --build builddir --target test
kokkos-3.7.01/.github/workflows/osx.yml 0000664 0000000 0000000 00000002323 14343743117 0020043 0 ustar 00root root 0000000 0000000 name: github-OSX
on: [push, pull_request]
concurrency:
group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{github.event_name == 'pull_request'}}
jobs:
osxci:
name: osx-ci
runs-on: [macos-latest]
strategy:
matrix:
include:
- backend: "SERIAL"
cmake_build_type: "RelWithDebInfo"
- backend: "THREADS"
cmake_build_type: "RelWithDebInfo"
- backend: "SERIAL"
cmake_build_type: "Debug"
- backend: "SERIAL"
cmake_build_type: "Release"
steps:
- uses: actions/checkout@v2
- name: configure
run:
cmake -B build .
-DKokkos_ENABLE_${{ matrix.backend }}=On
-DCMAKE_CXX_FLAGS="-Werror"
-DCMAKE_CXX_STANDARD=14
-DKokkos_ARCH_NATIVE=ON
-DKokkos_ENABLE_COMPILER_WARNINGS=ON
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF
-DKokkos_ENABLE_TESTS=On
-DCMAKE_BUILD_TYPE=${{ matrix.cmake_build_type }}
- name: build
run:
cmake --build build --parallel 2
- name: test
working-directory: build
run: ctest --output-on-failure
kokkos-3.7.01/.gitignore 0000664 0000000 0000000 00000000431 14343743117 0015100 0 ustar 00root root 0000000 0000000 # Standard ignores
*~
*.pyc
\#*#
.#*
.*.swp
.cproject
.project
testing/
.settings/
/.vs
/out/build
/CMakeSettings.json
/out/mytest
CMakeUserPresets.json
# build directories in source tree
/build*
# IDE-specific files/folders
## VSCode
/.vscode
## QtCreator
/CMakeLists.txt.user*
kokkos-3.7.01/.jenkins 0000664 0000000 0000000 00000061044 14343743117 0014561 0 ustar 00root root 0000000 0000000 pipeline {
agent none
environment {
CCACHE_DIR = '/tmp/ccache'
CCACHE_MAXSIZE = '10G'
CCACHE_CPP2 = 'true'
}
options {
timeout(time: 6, unit: 'HOURS')
}
stages {
stage('Clang-Format') {
agent {
dockerfile {
filename 'Dockerfile.clang'
dir 'scripts/docker'
label 'nvidia-docker || docker'
args '-v /tmp/ccache.kokkos:/tmp/ccache'
}
}
steps {
sh './scripts/docker/check_format_cpp.sh'
}
}
stage('Build') {
parallel {
stage('OPENACC-NVHPC-CUDA-11.6') {
agent {
dockerfile {
filename 'Dockerfile.nvhpc'
dir 'scripts/docker'
label 'nvidia-docker && large_images'
args '--env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
steps {
sh '''rm -rf build && mkdir -p build && cd build && \
/opt/cmake/bin/cmake \
-DCMAKE_CXX_COMPILER=nvc++ \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_OPENACC=ON \
-DKokkos_ARCH_VOLTA70=ON \
.. && \
make -j8 && ctest --verbose'''
}
}
stage('CUDA-11.6-NVHPC') {
agent {
dockerfile {
filename 'Dockerfile.nvhpc'
dir 'scripts/docker'
label 'nvidia-docker && large_images'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
environment {
OMP_NUM_THREADS = 8
// Nested OpenMP does not work for this configuration,
// so disabling it
OMP_MAX_ACTIVE_LEVELS = 1
OMP_PLACES = 'threads'
OMP_PROC_BIND = 'spread'
NVCC_WRAPPER_DEFAULT_COMPILER = 'nvc++'
}
steps {
sh '''rm -rf build && mkdir -p build && cd build && \
/opt/cmake/bin/cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
-DCMAKE_CXX_FLAGS=-Werror \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_OPENMP=ON \
.. && \
make -j8 && ctest --verbose'''
}
}
stage('SYCL-OneAPI') {
agent {
dockerfile {
filename 'Dockerfile.sycl'
dir 'scripts/docker'
label 'nvidia-docker && volta'
args '-v /tmp/ccache.kokkos:/tmp/ccache'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CXX_FLAGS="-fsycl-device-code-split=per_kernel -Werror -Wno-gnu-zero-variadic-macro-arguments -Wno-linker-warnings" \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ARCH_VOLTA70=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_EXAMPLES=ON \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_SYCL=ON \
-DKokkos_ENABLE_UNSUPPORTED_ARCHS=ON \
-DCMAKE_CXX_STANDARD=17 \
.. && \
make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('HIP-ROCm-4.5-C++14') {
agent {
dockerfile {
filename 'Dockerfile.hipcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.5'
label 'rocm-docker && vega'
args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
}
}
environment {
OMP_NUM_THREADS = 8
OMP_MAX_ACTIVE_LEVELS = 3
OMP_PLACES = 'threads'
OMP_PROC_BIND = 'spread'
}
steps {
sh 'ccache --zero-stats'
sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_CXX_COMPILER=hipcc \
-DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument -DNDEBUG" \
-DCMAKE_CXX_STANDARD=14 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_HIP=ON \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON \
.. && \
make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('HIP-ROCm-4.5-C++17') {
agent {
dockerfile {
filename 'Dockerfile.hipcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.5'
label 'rocm-docker && vega'
args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_CXX_COMPILER=hipcc \
-DCMAKE_CXX_FLAGS="-Werror -Wno-unused-command-line-argument" \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_HIP=ON \
.. && \
make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
/*
stage('OPENMPTARGET-ROCm-4.5') {
agent {
dockerfile {
filename 'Dockerfile.hipcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=rocm/dev-ubuntu-20.04:4.5'
label 'rocm-docker && vega && AMD_Radeon_Instinct_MI60'
args '-v /tmp/ccache.kokkos:/tmp/ccache --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --env HIP_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES'
}
}
environment {
OMP_NUM_THREADS = 8
OMP_MAX_ACTIVE_LEVELS = 3
OMP_PLACES = 'threads'
OMP_PROC_BIND = 'spread'
LC_ALL = 'C'
}
steps {
sh 'ccache --zero-stats'
sh 'echo "/opt/rocm/llvm/lib" > /etc/ld.so.conf.d/llvm.conf && ldconfig'
sh '''rm -rf build && \
cmake \
-Bbuild \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_CXX_COMPILER=amdclang++ \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_OPENMPTARGET=ON \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ARCH_VEGA906=ON \
&& \
cmake --build build --parallel ${BUILD_JOBS} && \
cd build && ctest --output-on-failure
'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
*/
stage('OPENMPTARGET-Clang') {
agent {
dockerfile {
filename 'Dockerfile.openmptarget'
dir 'scripts/docker'
label 'nvidia-docker && volta'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=RelWithDebInfo \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CXX_FLAGS="-Wno-unknown-cuda-version -Werror -Wno-undefined-internal -Wno-pass-failed" \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_TUNING=ON \
-DKokkos_ENABLE_OPENMPTARGET=ON \
-DKokkos_ARCH_VOLTA70=ON \
-DCMAKE_CXX_STANDARD=17 \
.. && \
make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('CUDA-10.1-Clang-Tidy') {
agent {
dockerfile {
filename 'Dockerfile.kokkosllvmproject'
dir 'scripts/docker'
label 'nvidia-docker && volta'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_CLANG_TIDY="clang-tidy;-warnings-as-errors=*" \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER=clang++ \
-DCMAKE_CXX_FLAGS=-Werror \
-DCMAKE_CXX_STANDARD=14 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_TUNING=ON \
-DKokkos_ARCH_VOLTA70=ON \
.. && \
make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('CUDA-9.2-NVCC') {
agent {
dockerfile {
filename 'Dockerfile.nvcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=nvidia/cuda:9.2-devel'
label 'nvidia-docker && volta'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
../gnu_generate_makefile.bash \
--with-options=compiler_warnings \
--cxxflags="-Werror" \
--cxxstandard=c++14 \
--with-cuda \
--with-cuda-options=enable_lambda \
--arch=Volta70 \
.. && \
make test -j8'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('CUDA-11.0-NVCC-C++17-RDC') {
agent {
dockerfile {
filename 'Dockerfile.nvcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.0.3-devel-ubuntu18.04 --build-arg ADDITIONAL_PACKAGES="g++-8 gfortran clang" --build-arg CMAKE_VERSION=3.17.3'
label 'nvidia-docker'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
environment {
OMP_NUM_THREADS = 8
// Nested OpenMP does not work for this configuration,
// so disabling it
OMP_MAX_ACTIVE_LEVELS = 1
OMP_PLACES = 'threads'
OMP_PROC_BIND = 'spread'
NVCC_WRAPPER_DEFAULT_COMPILER = 'g++-8'
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf install && mkdir -p install && \
rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER=g++-8 \
-DCMAKE_CXX_FLAGS=-Werror \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=OFF \
-DKokkos_ENABLE_CUDA_UVM=ON \
-DKokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DCMAKE_INSTALL_PREFIX=${PWD}/../install \
.. && \
make -j8 install && \
cd .. && \
rm -rf build-tests && mkdir -p build-tests && cd build-tests && \
export CMAKE_PREFIX_PATH=${PWD}/../install && \
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
-DCMAKE_CXX_FLAGS=-Werror --Werror=all-warnings -Xcudafe --diag_suppress=3159 \
-DCMAKE_CXX_STANDARD=17 \
-DKokkos_INSTALL_TESTING=ON \
.. && \
make -j8 && ctest --verbose && \
cd ../example/build_cmake_installed && \
rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_CXX_COMPILER=g++-8 \
-DCMAKE_CXX_FLAGS=-Werror \
-DCMAKE_CXX_STANDARD=17 \
.. && \
make -j8 && ctest --verbose && \
cd ../.. && \
cmake -B build_cmake_installed_different_compiler/build -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_FLAGS=-Werror -DCMAKE_CXX_STANDARD=17 build_cmake_installed_different_compiler && \
cmake --build build_cmake_installed_different_compiler/build --target all && \
cmake --build build_cmake_installed_different_compiler/build --target test'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('CUDA-11.6-NVCC-DEBUG') {
agent {
dockerfile {
filename 'Dockerfile.nvcc'
dir 'scripts/docker'
additionalBuildArgs '--build-arg BASE=nvidia/cuda:11.6.0-devel-ubuntu20.04'
label 'nvidia-docker'
args '-v /tmp/ccache.kokkos:/tmp/ccache --env NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES'
}
}
steps {
sh 'ccache --zero-stats'
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
-DCMAKE_CXX_COMPILER=$WORKSPACE/bin/nvcc_wrapper \
-DCMAKE_CXX_FLAGS=-Werror \
-DCMAKE_CXX_STANDARD=14 \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEBUG=ON \
-DKokkos_ENABLE_DEBUG_BOUNDS_CHECK=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_LIBDL=OFF \
.. && \
make -j8 && ctest --verbose && \
cd ../example/build_cmake_in_tree && \
rm -rf build && mkdir -p build && cd build && \
cmake -DCMAKE_CXX_STANDARD=14 .. && make -j8 && ctest --verbose'''
}
post {
always {
sh 'ccache --show-stats'
}
}
}
stage('GCC-5.3.1') {
agent {
dockerfile {
filename 'Dockerfile.gcc'
dir 'scripts/docker'
label 'docker'
}
}
environment {
OMP_NUM_THREADS = 8
OMP_MAX_ACTIVE_LEVELS = 3
OMP_PROC_BIND = 'true'
}
steps {
sh '''rm -rf build && mkdir -p build && cd build && \
cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_CXX_STANDARD=14 \
-DCMAKE_CXX_FLAGS=-Werror \
-DKokkos_ARCH_NATIVE=ON \
-DKokkos_ENABLE_COMPILER_WARNINGS=ON \
-DKokkos_ENABLE_DEPRECATED_CODE_3=ON \
-DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF \
-DKokkos_ENABLE_TESTS=ON \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_LIBDL=OFF \
-DKokkos_ENABLE_LIBQUADMATH=ON \
-DKokkos_ENABLE_SERIAL=ON \
-DCMAKE_PREFIX_PATH=/usr/lib/gcc/x86_64-linux-gnu/5.3.1 \
.. && \
make -j8 && ctest --verbose && gcc -I$PWD/../core/src/ ../core/unit_test/tools/TestCInterface.c'''
}
}
}
}
}
}
kokkos-3.7.01/.travis.yml 0000664 0000000 0000000 00000006433 14343743117 0015231 0 ustar 00root root 0000000 0000000 sudo: false
language: cpp
os:
- linux
compiler:
- gcc
- clang
cache:
- ccache
stages:
- canary
- test
jobs:
include:
- stage: canary
env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP"
os: linux
branches:
only:
- master
- develop
- /^release/
env:
-
# - BACKEND="OPENMP"
- BACKEND="PTHREAD"
- CMAKE_BUILD_TYPE=Debug COVERAGE=yes GTEST_FILTER="-*DeathTest*"
- CMAKE_BUILD_TYPE=Debug BACKEND="OPENMP" COVERAGE=yes GTEST_FILTER="-*DeathTest*"
# - CMAKE_BUILD_TYPE=Debug BACKEND="PTHREAD" COVERAGE=yes
- CMAKE_BUILD_TYPE=Release
- CMAKE_BUILD_TYPE=Release BACKEND="OPENMP"
# - CMAKE_BUILD_TYPE=Release BACKEND="PTHREAD"
matrix:
exclude:
- os: linux
compiler: gcc
env: CMAKE_BUILD_TYPE=Release BACKEND="OPENMP"
# Install newer CMake. The distribution comes with CMake 3.12.4 but we require at least 3.16
install:
- CMAKE_VERSION=3.17.1
- CMAKE_DIR=/opt/cmake
- CMAKE_KEY=2D2CEF1034921684 &&
CMAKE_URL=https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION} &&
CMAKE_SCRIPT=cmake-${CMAKE_VERSION}-Linux-x86_64.sh &&
CMAKE_SHA256=cmake-${CMAKE_VERSION}-SHA-256.txt &&
wget --quiet ${CMAKE_URL}/${CMAKE_SHA256} &&
wget --quiet ${CMAKE_URL}/${CMAKE_SHA256}.asc &&
wget --quiet ${CMAKE_URL}/${CMAKE_SCRIPT} &&
#gpg --keyserver pool.sks-keyservers.net --recv-keys ${CMAKE_KEY} &&
#gpg --verify ${CMAKE_SHA256}.asc ${CMAKE_SHA256} &&
#grep ${CMAKE_SCRIPT} ${CMAKE_SHA256} | sha256sum --check &&
mkdir -p ${CMAKE_DIR} &&
sh ${CMAKE_SCRIPT} --skip-license --prefix=${CMAKE_DIR} &&
rm cmake*
- PATH=${CMAKE_DIR}/bin:$PATH
- cd ${TRAVIS_BUILD_DIR}
before_script:
- ccache -z
- if [[ ${COVERAGE} ]]; then export CXX="${CXX} --coverage"; fi
- if [[ ! ${CMAKE_BUILD_TYPE} ]]; then export CXXFLAGS="${CXXFLAGS} -O2"; fi
script:
- export OMP_NUM_THREADS=2
- export OMP_PLACES=threads
- export OMP_PROC_BIND=spread
# LD_LIBRARY_PATH workaround to find clang's libomp: https://github.com/travis-ci/travis-ci/issues/8613
- if [[ ${CC} = clang ]]; then export LD_LIBRARY_PATH=/usr/local/clang/lib${LD_LIBRARY_PATH:+:}$LD_LIBRARY_PATH; fi
# enable ccache for clang on linux and add CCACHE_CPP2 to avoid 'Argument unused during compilation -I...' warning
- if [[ ${TRAVIS_OS_NAME} = linux && ${CC} = clang ]]; then
ln -s /usr/bin/ccache $HOME/bin/clang++;
export CCACHE_CPP2=yes;
fi
- mkdir build &&
pushd build &&
cmake ..
${BACKEND:+-DKokkos_ENABLE_${BACKEND}=On}
-DCMAKE_CXX_FLAGS="${CXXFLAGS} -Werror"
-DCMAKE_CXX_STANDARD=14
-DKokkos_ENABLE_COMPILER_WARNINGS=ON
-DKokkos_ENABLE_TESTS=On
${CMAKE_BUILD_TYPE:+-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}} &&
make VERBOSE=1 -j2 &&
travis_wait 60 make test CTEST_OUTPUT_ON_FAILURE=1 &&
make install DESTDIR=${PWD}/install && rm -rf ${PWD}/install/usr/local && rmdir ${PWD}/install/usr &&
popd
after_success:
- ccache -s
- if [[ ${COVERAGE} ]]; then
mkdir -p $HOME/.local/bin && wget -O $HOME/.local/bin/codecov https://codecov.io/bash && chmod +x $HOME/.local/bin/codecov;
pushd build &&
if [[ ${CC} = clang* ]]; then
codecov -x "llvm-cov gcov" -F "${CC}";
else
codecov -x gcov -F "${CC}";
fi;
fi
kokkos-3.7.01/BUILD.md 0000664 0000000 0000000 00000032630 14343743117 0014277 0 ustar 00root root 0000000 0000000 
# Installing and Using Kokkos
## Kokkos Philosophy
Kokkos provides a modern CMake style build system.
As C++ continues to develop for C++20 and beyond, CMake is likely to provide the most robust support
for C++. Applications heavily leveraging Kokkos are strongly encouraged to use a CMake build system.
You can either use Kokkos as an installed package (encouraged) or use Kokkos in-tree in your project.
Modern CMake is exceedingly simple at a high-level (with the devil in the details).
Once Kokkos is installed, in your `CMakeLists.txt` simply use:
````cmake
find_package(Kokkos REQUIRED)
````
Then for every executable or library in your project:
````cmake
target_link_libraries(myTarget Kokkos::kokkos)
````
That's it! There is no checking Kokkos preprocessor, compiler, or linker flags.
Kokkos propagates all the necessary flags to your project.
This means not only is linking to Kokkos easy, but Kokkos itself can actually configure compiler and linker flags for *your*
project.
When configuring your project just set:
````bash
> cmake ${srcdir} \
-DKokkos_ROOT=${kokkos_install_prefix} \
-DCMAKE_CXX_COMPILER=${compiler_used_to_build_kokkos}
````
Note: You may need the following if your project requires a minimum CMake version older than 3.12:
````cmake
cmake_policy(SET CMP0074 NEW)
````
If building in-tree, there is no `find_package`. You can use `add_subdirectory(kokkos)` with the Kokkos source and again just link with `target_link_libraries(myTarget Kokkos::kokkos)`.
The examples in `example/build_cmake_installed` and `example/build_cmake_in_tree` can help get you started.
## Configuring CMake
A very basic installation of Kokkos is done with:
````bash
> cmake ${srcdir} \
-DCMAKE_CXX_COMPILER=g++ \
-DCMAKE_INSTALL_PREFIX=${kokkos_install_folder}
````
which builds and installs a default Kokkos when you run `make install`.
There are numerous device backends, options, and architecture-specific optimizations that can be configured, e.g.
````bash
> cmake ${srcdir} \
-DCMAKE_CXX_COMPILER=g++ \
-DCMAKE_INSTALL_PREFIX=${kokkos_install_folder} \
-DKokkos_ENABLE_OPENMP=ON
````
which activates the OpenMP backend. All of the options controlling device backends, options, architectures, and third-party libraries (TPLs) are given below.
## Known Issues
### Cray
* The Cray compiler wrappers do static linking by default. This seems to break the Kokkos build. You will likely need to set the environment variable `CRAYPE_LINK_TYPE=dynamic` in order to link correctly. Kokkos warns during configure if this is missing.
* The Cray compiler identifies to CMake as Clang, but it sometimes has its own flags that differ from Clang. We try to include all exceptions, but flag errors may occur in which a Clang-specific flag is passed that the Cray compiler does not recognize.
### Fortran
* In a mixed C++/Fortran code, CMake will use the C++ linker by default. If you override this behavior and use Fortran as the link language, the link may break because Kokkos adds linker flags expecting the linker to be C++. Prior to CMake 3.18, Kokkos has no way of detecting in downstream projects that the linker was changed to Fortran. From CMake 3.18, Kokkos can use generator expressions to avoid adding flags when the linker is not C++. Note: Kokkos will not add any linker flags in this Fortran case. The user will be entirely on their own to add the appropriate linker flags.
## Spack
An alternative to manually building with CMake is to use the Spack package manager.
Make sure you have downloaded [Spack](https://github.com/spack/spack).
The easiest way to configure the Spack environment is:
````bash
> source spack/share/spack/setup-env.sh
````
with other scripts available for other shells.
You can display information about how to install packages with:
````bash
> spack info kokkos
````
A basic installation would be done as:
````bash
> spack install kokkos
````
Spack allows options and compilers to be tuned in the install command.
````bash
> spack install kokkos@3.0 %gcc@7.3.0 +openmp
````
This example illustrates the three most common parameters to Spack:
* Variants: specified with, e.g. `+openmp`, this activates (or deactivates with, e.g. `~openmp`) certain options.
* Version: immediately following `kokkos` the `@version` can specify a particular Kokkos to build
* Compiler: a default compiler will be chosen if not specified, but an exact compiler version can be given with the `%` option.
For a complete list of Kokkos options, run:
````bash
> spack info kokkos
````
More details can be found in the [Spack README](Spack.md)
#### Spack Development
Spack currently installs packages to a location determined by a unique hash. This hash name is not really "human readable".
Generally, Spack usage should never really require you to reference the computer-generated unique install folder.
If you must know, you can locate Spack Kokkos installations with:
````bash
> spack find -p kokkos ...
````
where `...` is the unique spec identifying the particular Kokkos configuration and version.
A better way to use Spack for doing Kokkos development is the dev-build feature of Spack.
For dev-build details, consult the kokkos-spack repository [README](https://github.com/kokkos/kokkos-spack/blob/master/README.md).
# Kokkos Keyword Listing
## Device Backends
Device backends can be enabled by specifying `-DKokkos_ENABLE_X`.
* Kokkos_ENABLE_CUDA
* Whether to build CUDA backend
* BOOL Default: OFF
* Kokkos_ENABLE_HPX
* Whether to build HPX backend (experimental)
* BOOL Default: OFF
* Kokkos_ENABLE_OPENMP
* Whether to build OpenMP backend
* BOOL Default: OFF
* Kokkos_ENABLE_THREADS
* Whether to build C++ thread backend
* BOOL Default: OFF
* Kokkos_ENABLE_SERIAL
* Whether to build serial backend
* BOOL Default: ON
* Kokkos_ENABLE_HIP (Experimental)
* Whether to build HIP backend
* BOOL Default: OFF
* Kokkos_ENABLE_OPENMPTARGET (Experimental)
* Whether to build the OpenMP target backend
* BOOL Default: OFF
## Enable Options
Options can be enabled by specifying `-DKokkos_ENABLE_X`.
* Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION
* Whether to aggressively vectorize loops
* BOOL Default: OFF
* Kokkos_ENABLE_COMPILER_WARNINGS
* Whether to print all compiler warnings
* BOOL Default: OFF
* Kokkos_ENABLE_CUDA_CONSTEXPR
* Whether to activate experimental relaxed constexpr functions
* BOOL Default: OFF
* Kokkos_ENABLE_CUDA_LAMBDA
* Whether to activate experimental lambda features
* BOOL Default: OFF
* Kokkos_ENABLE_CUDA_LDG_INTRINSIC
* Whether to use CUDA LDG intrinsics
* BOOL Default: OFF
* Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
* Whether to enable relocatable device code (RDC) for CUDA
* BOOL Default: OFF
* Kokkos_ENABLE_CUDA_UVM
* Whether to use unified memory (UM) by default for CUDA
* BOOL Default: OFF
* Kokkos_ENABLE_DEBUG
* Whether to activate extra debug features - may increase compile times
* BOOL Default: OFF
* Kokkos_ENABLE_DEBUG_BOUNDS_CHECK
* Whether to use bounds checking - will increase runtime
* BOOL Default: OFF
* Kokkos_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
* Debug check on dual views
* BOOL Default: OFF
* Kokkos_ENABLE_EXAMPLES
* Whether to enable building examples
* BOOL Default: OFF
* Kokkos_ENABLE_HPX_ASYNC_DISPATCH
* Whether HPX supports asynchronous dispatch
* BOOL Default: OFF
* Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC
* Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2). This is an experimental performance feature and currently has issues when used with UCX. See https://github.com/kokkos/kokkos/issues/4228 for more details.
* BOOL Default: OFF
* Kokkos_ENABLE_LARGE_MEM_TESTS
* Whether to perform extra large memory tests
* BOOL Default: OFF
* Kokkos_ENABLE_PROFILING_LOAD_PRINT
* Whether to print information about which profiling tools got loaded
* BOOL Default: OFF
* Kokkos_ENABLE_TESTS
* Whether to enable test suite
* BOOL Default: OFF
## Other Options
* Kokkos_CXX_STANDARD
* The C++ standard for Kokkos to use: c++14, c++17, or c++20. This should be given in CMake style as 14, 17, or 20.
* STRING Default: 14
## Third-party Libraries (TPLs)
The following options control enabling TPLs:
* Kokkos_ENABLE_HPX
* Whether to enable the HPX library
* BOOL Default: OFF
* Kokkos_ENABLE_HWLOC
* Whether to enable the HWLOC library
* BOOL Default: Off
* Kokkos_ENABLE_LIBNUMA
* Whether to enable the LIBNUMA library
* BOOL Default: Off
* Kokkos_ENABLE_MEMKIND
* Whether to enable the MEMKIND library
* BOOL Default: Off
* Kokkos_ENABLE_LIBDL
* Whether to enable the LIBDL library
* BOOL Default: On
* Kokkos_ENABLE_LIBRT
* Whether to enable the LIBRT library
* BOOL Default: Off
The following options control finding and configuring non-CMake TPLs:
* Kokkos_CUDA_DIR or CUDA_ROOT
* Location of CUDA install prefix for libraries
* PATH Default:
* Kokkos_HWLOC_DIR or HWLOC_ROOT
* Location of HWLOC install prefix
* PATH Default:
* Kokkos_LIBNUMA_DIR or LIBNUMA_ROOT
* Location of LIBNUMA install prefix
* PATH Default:
* Kokkos_MEMKIND_DIR or MEMKIND_ROOT
* Location of MEMKIND install prefix
* PATH Default:
* Kokkos_LIBDL_DIR or LIBDL_ROOT
* Location of LIBDL install prefix
* PATH Default:
* Kokkos_LIBRT_DIR or LIBRT_ROOT
* Location of LIBRT install prefix
* PATH Default:
The following options control `find_package` paths for CMake-based TPLs:
* HPX_DIR or HPX_ROOT
* Location of HPX prefix (ROOT) or CMake config file (DIR)
* PATH Default:
## Architecture Keywords
Architecture-specific optimizations can be enabled by specifying `-DKokkos_ARCH_X`.
* Kokkos_ARCH_NATIVE
* Whether to optimize for the local CPU architecture
* BOOL Default: OFF
* Kokkos_ARCH_AMDAVX
* Whether to optimize for the AMDAVX architecture
* BOOL Default: OFF
* Kokkos_ARCH_ARMV80
* Whether to optimize for the ARMV80 architecture
* BOOL Default: OFF
* Kokkos_ARCH_ARMV81
* Whether to optimize for the ARMV81 architecture
* BOOL Default: OFF
* Kokkos_ARCH_ARMV8_THUNDERX
* Whether to optimize for the ARMV8_THUNDERX architecture
* BOOL Default: OFF
* Kokkos_ARCH_ARMV8_TX2
* Whether to optimize for the ARMV8_TX2 architecture
* BOOL Default: OFF
* Kokkos_ARCH_BDW
* Whether to optimize for the BDW architecture
* BOOL Default: OFF
* Kokkos_ARCH_BGQ
* Whether to optimize for the BGQ architecture
* BOOL Default: OFF
* Kokkos_ARCH_ZEN
* Whether to optimize for the Zen architecture
* BOOL Default: OFF
* Kokkos_ARCH_ZEN2
* Whether to optimize for the Zen2 architecture
* BOOL Default: OFF
* Kokkos_ARCH_ZEN3
* Whether to optimize for the Zen3 architecture
* BOOL Default: OFF
* Kokkos_ARCH_HSW
* Whether to optimize for the HSW architecture
* BOOL Default: OFF
* Kokkos_ARCH_KEPLER30
* Whether to optimize for the KEPLER30 architecture
* BOOL Default: OFF
* Kokkos_ARCH_KEPLER32
* Whether to optimize for the KEPLER32 architecture
* BOOL Default: OFF
* Kokkos_ARCH_KEPLER35
* Whether to optimize for the KEPLER35 architecture
* BOOL Default: OFF
* Kokkos_ARCH_KEPLER37
* Whether to optimize for the KEPLER37 architecture
* BOOL Default: OFF
* Kokkos_ARCH_KNC
* Whether to optimize for the KNC architecture
* BOOL Default: OFF
* Kokkos_ARCH_KNL
* Whether to optimize for the KNL architecture
* BOOL Default: OFF
* Kokkos_ARCH_MAXWELL50
* Whether to optimize for the MAXWELL50 architecture
* BOOL Default: OFF
* Kokkos_ARCH_MAXWELL52
* Whether to optimize for the MAXWELL52 architecture
* BOOL Default: OFF
* Kokkos_ARCH_MAXWELL53
* Whether to optimize for the MAXWELL53 architecture
* BOOL Default: OFF
* Kokkos_ARCH_PASCAL60
* Whether to optimize for the PASCAL60 architecture
* BOOL Default: OFF
* Kokkos_ARCH_PASCAL61
* Whether to optimize for the PASCAL61 architecture
* BOOL Default: OFF
* Kokkos_ARCH_POWER7
* Whether to optimize for the POWER7 architecture
* BOOL Default: OFF
* Kokkos_ARCH_POWER8
* Whether to optimize for the POWER8 architecture
* BOOL Default: OFF
* Kokkos_ARCH_POWER9
* Whether to optimize for the POWER9 architecture
* BOOL Default: OFF
* Kokkos_ARCH_ICL
* Whether to optimize for the ICL architecture
* BOOL Default: OFF
* Kokkos_ARCH_ICX
* Whether to optimize for the ICX architecture
* BOOL Default: OFF
* Kokkos_ARCH_SKL
* Whether to optimize for the SKL architecture
* BOOL Default: OFF
* Kokkos_ARCH_SKX
* Whether to optimize for the SKX architecture
* BOOL Default: OFF
* Kokkos_ARCH_SNB
* Whether to optimize for the SNB architecture
* BOOL Default: OFF
* Kokkos_ARCH_SPR
* Whether to optimize for the SPR architecture
* BOOL Default: OFF
* Kokkos_ARCH_TURING75
* Whether to optimize for the TURING75 architecture
* BOOL Default: OFF
* Kokkos_ARCH_VOLTA70
* Whether to optimize for the VOLTA70 architecture
* BOOL Default: OFF
* Kokkos_ARCH_VOLTA72
* Whether to optimize for the VOLTA72 architecture
* BOOL Default: OFF
* Kokkos_ARCH_WSM
* Whether to optimize for the WSM architecture
* BOOL Default: OFF
##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE)
[BSD 3-Clause License](https://opensource.org/licenses/BSD-3-Clause)
Under the terms of Contract DE-NA0003525 with NTESS,
the U.S. Government retains certain rights in this software.
kokkos-3.7.01/CHANGELOG.md 0000664 0000000 0000000 00000446726 14343743117 0014746 0 ustar 00root root 0000000 0000000 # Change Log
## [3.7.01](https://github.com/kokkos/kokkos/tree/3.7.01) (2022-12-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.7.00...3.7.01)
### Bug Fixes:
- Add fences to all sorting routines not taking an execution space instance argument [\#5547](https://github.com/kokkos/kokkos/pull/5547)
- Fix repeated `team_reduce` without barrier [\#5552](https://github.com/kokkos/kokkos/pull/5552)
- Fix memory spaces in `create_mirror_view` overloads using `view_alloc` [\#5521](https://github.com/kokkos/kokkos/pull/5521)
- Allow `as_view_of_rank_n()` to be overloaded for "special" scalar types [\#5553](https://github.com/kokkos/kokkos/pull/5553)
- Fix warning calling a `__host__` function from a `__host__ __device__` function from `View::as_view_of_rank_n` [\#5591](https://github.com/kokkos/kokkos/pull/5591)
- OpenMPTarget: adding implementation to set device id. [\#5557](https://github.com/kokkos/kokkos/pull/5557)
- Use `Kokkos::atomic_load` to Correct Race Condition Giving Rise to Seg Faulting Error in OpenMP tests [\#5559](https://github.com/kokkos/kokkos/pull/5559)
- cmake: define `KOKKOS_ARCH_A64FX` [\#5561](https://github.com/kokkos/kokkos/pull/5561)
- Only link against libatomic in gnu-make OpenMPTarget build [\#5565](https://github.com/kokkos/kokkos/pull/5565)
- Fix static extents assignment for LayoutLeft/LayoutRight assignment [\#5566](https://github.com/kokkos/kokkos/pull/5566)
- Do not add -cuda to the link line with NVHPC compiler when the CUDA backend is not actually enabled [\#5569](https://github.com/kokkos/kokkos/pull/5569)
- Export the flags in `KOKKOS_AMDGPU_OPTIONS` when using Trilinos [\#5571](https://github.com/kokkos/kokkos/pull/5571)
- Add support for detecting MPI local rank with MPICH and PMI [\#5570](https://github.com/kokkos/kokkos/pull/5570) [\#5582](https://github.com/kokkos/kokkos/pull/5582)
- Remove listing of undefined TPL dependencies [\#5573](https://github.com/kokkos/kokkos/pull/5573)
- ClockTic changed to 64 bit to fix overflow on Power [\#5592](https://github.com/kokkos/kokkos/pull/5592)
- Fix incorrect offset in CUDA and HIP parallel scan for < 4 byte types [\#5607](https://github.com/kokkos/kokkos/pull/5607)
- Fix initialization of Cuda lock arrays [\#5622](https://github.com/kokkos/kokkos/pull/5622)
## [3.7.00](https://github.com/kokkos/kokkos/tree/3.7.00) (2022-08-22)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.01...3.7.00)
### Features:
- Use non-volatile `join()` member functions and `operator+=` in `parallel_reduce/scan` [\#4931](https://github.com/kokkos/kokkos/pull/4931) [\#4954](https://github.com/kokkos/kokkos/pull/4954) [\#4951](https://github.com/kokkos/kokkos/pull/4951)
- Add `SIMD` sub package (requires C++17) [\#5016](https://github.com/kokkos/kokkos/pull/5016)
- Add `is_finalized()` [\#5247](https://github.com/kokkos/kokkos/pull/5247)
- Promote mathematical functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#4791](https://github.com/kokkos/kokkos/pull/4791)
- Promote `min`, `max`, `clamp`, `minmax` functions from `namespace Kokkos::Experimental` to `namespace Kokkos` [\#5170](https://github.com/kokkos/kokkos/pull/5170)
- Add `round`, `logb`, `nextafter`, `copysign`, and `signbit` math functions [\#4768](https://github.com/kokkos/kokkos/pull/4768)
- Add `HIPManagedSpace`, similar to `CudaUVMSpace` [\#5112](https://github.com/kokkos/kokkos/pull/5112)
- Accept view construction allocation properties in `create_mirror[_view,_view_and_copy]` and `resize/realloc` [\#5125](https://github.com/kokkos/kokkos/pull/5125) [\#5095](https://github.com/kokkos/kokkos/pull/5095) [\#5035](https://github.com/kokkos/kokkos/pull/5035) [\#4805](https://github.com/kokkos/kokkos/pull/4805) [\#4844](https://github.com/kokkos/kokkos/pull/4844)
- Allow `MemorySpace::allocate()` to be called with execution space [\#4826](https://github.com/kokkos/kokkos/pull/4826)
- Experimental: Compile time view subscriber [\#4197](https://github.com/kokkos/kokkos/pull/4197)
### Backends and Archs Enhancements:
- Add support for Sapphire Rapids Intel architecture [\#5015](https://github.com/kokkos/kokkos/pull/5015)
- Add support for ICX, SKL and ICL Intel architectures [\#5013](https://github.com/kokkos/kokkos/pull/5013) [\#4929](https://github.com/kokkos/kokkos/pull/4929)
- Add arch flags for Intel GPU Ponte Vecchio [\#4932](https://github.com/kokkos/kokkos/pull/4932)
- SYCL: require GPU if GPU architecture was set at configuration time (i.e. do not allow fallback to CPU device) [\#5264](https://github.com/kokkos/kokkos/pull/5264) [\#5222](https://github.com/kokkos/kokkos/pull/5222)
- SYCL: Add `SYCL::sycl_queue()` for interoperability [\#5241](https://github.com/kokkos/kokkos/pull/5241)
- SYCL: Loosen restriction for using built-in `sycl::group_broadcast` [\#4552](https://github.com/kokkos/kokkos/pull/4552)
- SYCL: preserve address space [\#4396](https://github.com/kokkos/kokkos/pull/4396)
- OpenMPTarget: Adding a workaround for team scan [\#5219](https://github.com/kokkos/kokkos/pull/5219)
- OpenMPTarget: Adding logic to skip the kernel launch if `league_size=0` [\#5067](https://github.com/kokkos/kokkos/pull/5067)
- OpenMPTarget: Make sure `Kokkos::abort()` causes abnormal program termination when called on the host-side [\#4808](https://github.com/kokkos/kokkos/pull/4808)
- HIP: Make HIPHostPinnedSpace coarse-grained [\#5152](https://github.com/kokkos/kokkos/pull/5152)
- Refactor OpenMP `parallel_for` implementation to use more native OpenMP constructs [\#4664](https://github.com/kokkos/kokkos/pull/4664)
- Add option to optimize for local CPU architecture `Kokkos_ARCH_NATIVE` [\#4930](https://github.com/kokkos/kokkos/pull/4930)
### Implemented enhancements
- Add command line argument/environment variable to print the configuration [\#5233](https://github.com/kokkos/kokkos/pull/5233)
- Improve error message in view memory access violations [\#4950](https://github.com/kokkos/kokkos/pull/4950)
- Remove unnecessary fences in View initialization [\#4823](https://github.com/kokkos/kokkos/pull/4823)
- Make `View::shmem_size()` device-callable [\#4936](https://github.com/kokkos/kokkos/pull/4936)
- Update numerics support for `__float128` [\#5081](https://github.com/kokkos/kokkos/pull/5081)
- Add `log10` overload for `Kokkos::complex` [\#5009](https://github.com/kokkos/kokkos/pull/5009)
- Add `[[nodiscard]]` to `ScopeGuard` [\#5224](https://github.com/kokkos/kokkos/pull/5224)
- Add structured binding support for `Kokkos::Array` [\#4962](https://github.com/kokkos/kokkos/pull/4962)
- Enable accessing `Kokkos::Array` elements in constant expressions [\#4916](https://github.com/kokkos/kokkos/pull/4916)
- Mark `as_view_of_rank_n` as KOKKOS_FUNCTION [\#5248](https://github.com/kokkos/kokkos/pull/5248)
- Cleanup/rework fence overloads [\#5148](https://github.com/kokkos/kokkos/pull/5148)
- Assert that `Layout` construction from extents is valid in functions taking integer extents [\#5209](https://github.com/kokkos/kokkos/pull/5209)
- Add `fill_random` overload that takes an execution space as first argument [\#5181](https://github.com/kokkos/kokkos/pull/5181)
- Avoid some unnecessary fences in `parallel_reduce/scan` [\#5154](https://github.com/kokkos/kokkos/pull/5154)
- Include `KOKKOS_ENABLE_LIBDL` in options when printing configuration [\#5086](https://github.com/kokkos/kokkos/pull/5086)
- DynRankView: make `layout()` return the same as a corresponding static View [\#5026](https://github.com/kokkos/kokkos/pull/5026)
- Use `_mm_malloc` for icpx [\#5012](https://github.com/kokkos/kokkos/pull/5012)
- Avoid forcing matching execution spaces in `BinSort` constructor and `sort()` [\#4919](https://github.com/kokkos/kokkos/pull/4919)
- Check number of bins in `BinSort` [\#4890](https://github.com/kokkos/kokkos/pull/4890)
- Improve performance in parallel STL-like algorithms [\#4887](https://github.com/kokkos/kokkos/pull/4887) [\#4886](https://github.com/kokkos/kokkos/pull/4886)
- Disable `memset` on A64FX and launch `parallel_for` instead (performance) [\#4884](https://github.com/kokkos/kokkos/pull/4884)
- Allow non-power-of-two team sizes for team reductions and scans [\#4809](https://github.com/kokkos/kokkos/pull/4809)
#### Harmonization of Kokkos execution environment initialization:
- Warn when unable to detect local MPI rank and user explicitly asked for it [\#5263](https://github.com/kokkos/kokkos/pull/5263)
- Refactor parsing of command line arguments and environment variables [\#5221](https://github.com/kokkos/kokkos/pull/5221)
- Refactor device selection at initialization [\#5211](https://github.com/kokkos/kokkos/pull/5211)
- Rename tools settings for consistency [\#5201](https://github.com/kokkos/kokkos/pull/5201)
- Print help only once [\#5128](https://github.com/kokkos/kokkos/pull/5128)
- Update precedence rule in initialization [\#5130](https://github.com/kokkos/kokkos/pull/5130)
- Warn instead of just ignoring user settings when kokkos-tools is disabled [\#5088](https://github.com/kokkos/kokkos/pull/5088)
- Drop numa args in threads backend initialization [\#5127](https://github.com/kokkos/kokkos/pull/5127)
- Warn users when a flag prefixed with -[-]kokkos is not recognized and do not remove it [\#5256](https://github.com/kokkos/kokkos/pull/5256)
- Give back to Core what belongs to Core (aka moving tune_internals option from Tools back to Core) [\#5202](https://github.com/kokkos/kokkos/pull/5202)
#### Build system updates:
- `nvcc_wrapper`: filter out -pedantic-errors from nvcc options [\#5235](https://github.com/kokkos/kokkos/pull/5235)
- `nvcc_wrapper`: add known nvcc option --source-in-ptx [\#5052](https://github.com/kokkos/kokkos/pull/5052)
- Link libdl as interface library [\#5179](https://github.com/kokkos/kokkos/pull/5179)
- Only show GPU architectures with enabled corresponding backend [\#5119](https://github.com/kokkos/kokkos/pull/5119)
- Enable optional external desul build [\#5021](https://github.com/kokkos/kokkos/pull/5021) [\#5132](https://github.com/kokkos/kokkos/pull/5132)
- Export `Kokkos_CXX_STANDARD` variable with CMake [\#5068](https://github.com/kokkos/kokkos/pull/5068)
- Suppress warnings with nvc++ [\#5031](https://github.com/kokkos/kokkos/pull/5031)
- Disallow multiple host architectures in CMake [\#4996](https://github.com/kokkos/kokkos/pull/4996)
- Do not include compiler warning flags in the compile option of the cmake target [\#4989](https://github.com/kokkos/kokkos/pull/4989)
- AOT flags for OpenMPTarget targeting Intel GPUs [\#4915](https://github.com/kokkos/kokkos/pull/4915)
- Repurpose `Kokkos_ARCH_INTEL_GEN` for SYCL to mean JIT to be conforming with OMPT [\#4894](https://github.com/kokkos/kokkos/pull/4894)
- Replace amdgpu-target with offload-arch [\#4874](https://github.com/kokkos/kokkos/pull/4874)
- Do not enable `kokkos_launch_compiler` when `CMAKE_CXX_COMPILER_LAUNCHER` is set [\#4870](https://github.com/kokkos/kokkos/pull/4870)
- Move CMake version check up [\#4797](https://github.com/kokkos/kokkos/pull/4797)
### Incompatibilities:
- Remove `KOKKOS_THREAD_LOCAL` [\#5064](https://github.com/kokkos/kokkos/pull/5064)
- Remove `KOKKOS_ENABLE_POSIX_MEMALIGN` [\#5011](https://github.com/kokkos/kokkos/pull/5011)
- Remove unused `KOKKOS_ENABLE_TM` [\#4995](https://github.com/kokkos/kokkos/pull/4995)
- Remove unused cmakedefine `KOKKOS_ENABLE_COMPILER_WARNINGS` [\#4883](https://github.com/kokkos/kokkos/pull/4883)
- Remove unused `KOKKOS_ENABLE_DUALVIEW_MODIFY_CHECK` [\#4882](https://github.com/kokkos/kokkos/pull/4882)
- Drop Instruction Set Architecture (ISA) macros [\#4981](https://github.com/kokkos/kokkos/pull/4981)
- Warn in `ScopeGuard` about illegal usage [\#5250](https://github.com/kokkos/kokkos/pull/5250)
### Deprecations:
- Guard against non-public header inclusion [\#5178](https://github.com/kokkos/kokkos/pull/5178)
- Raise deprecation warnings if non empty WorkTag class is used [\#5230](https://github.com/kokkos/kokkos/pull/5230)
- Deprecate `parallel_*` overloads taking the label as trailing argument [\#5141](https://github.com/kokkos/kokkos/pull/5141)
- Deprecate nested types in functional [\#5185](https://github.com/kokkos/kokkos/pull/5185)
- Deprecate `InitArguments` struct and replace it with `InitializationSettings` [\#5135](https://github.com/kokkos/kokkos/pull/5135)
- Deprecate `finalize_all()` [\#5134](https://github.com/kokkos/kokkos/pull/5134)
- Deprecate command line arguments (other than `--help`) that are not prefixed with `kokkos-*` [\#5120](https://github.com/kokkos/kokkos/pull/5120)
- Deprecate `--[kokkos-]numa` cmdline arg and `KOKKOS_NUMA` env var [\#5117](https://github.com/kokkos/kokkos/pull/5117)
- Deprecate `--[kokkos-]threads` command line argument in favor of `--[kokkos-]num-threads` [\#5111](https://github.com/kokkos/kokkos/pull/5111)
- Deprecate `Kokkos::is_reducer_type` [\#4957](https://github.com/kokkos/kokkos/pull/4957)
- Deprecate `OffsetView` constructors taking `index_list_type` [\#4810](https://github.com/kokkos/kokkos/pull/4810)
- Deprecate overloads of `Kokkos::sort` taking a parameter `bool always_use_kokkos_sort` [\#5382](https://github.com/kokkos/kokkos/issues/5382)
- Warn about `parallel_reduce` cases that call `join()` with volatile-qualified arguments [\#5215](https://github.com/kokkos/kokkos/pull/5215)
### Bug Fixes:
- CUDA Reductions: Fix data races reported by Nvidia `compute-sanitizer` [\#4855](https://github.com/kokkos/kokkos/pull/4855)
- Work around Intel compiler bug [\#5301](https://github.com/kokkos/kokkos/pull/5301)
- Avoid allocating memory for UniqueToken [\#5300](https://github.com/kokkos/kokkos/pull/5300)
- DynamicView: Properly resize mirror instances after construction [\#5276](https://github.com/kokkos/kokkos/pull/5276)
- Remove Kokkos::Rank limit of 6 ranks [\#5271](https://github.com/kokkos/kokkos/pull/5271)
- Do not forget to set last element to nullptr when removing a flag in `Kokkos::initialize` [\#5272](https://github.com/kokkos/kokkos/pull/5272)
- Fix CUDA+MSVC build issue [\#5261](https://github.com/kokkos/kokkos/pull/5261)
- Fix `DynamicView::resize_serial` [\#5220](https://github.com/kokkos/kokkos/pull/5220)
- Fix cmake default compiler flags for unknown compiler [\#5217](https://github.com/kokkos/kokkos/pull/5217)
- Fix `move_backward` [\#5191](https://github.com/kokkos/kokkos/pull/5191)
- Fixing issue 5196 - missing symbol with intel compiler [\#5207](https://github.com/kokkos/kokkos/pull/5207)
- Preserve `KOKKOS_INVALID_INDEX` in ViewDimension and ArrayLayout construction [\#5188](https://github.com/kokkos/kokkos/pull/5188)
- Finalize `deep_copy_space` early avoiding printing to `std::cerr` for Cuda [\#5151](https://github.com/kokkos/kokkos/pull/5151)
- Use correct policy in Threads MDRange `parallel_reduce` [\#5123](https://github.com/kokkos/kokkos/pull/5123)
- Fix building with NVCC as the CXX compiler while the CUDA backend is not enabled [\#5115](https://github.com/kokkos/kokkos/pull/5115)
- OpenMPTarget Index range fix for MDRange. [\#5089](https://github.com/kokkos/kokkos/pull/5089)
- Fix bug with CUDA's team reduction for empty ranges [\#5079](https://github.com/kokkos/kokkos/pull/5079)
- Fix using `ZeroMemset` for Serial [\#5077](https://github.com/kokkos/kokkos/pull/5077)
- Fix `Kokkos::Vector::push_back` for default execution space [\#5047](https://github.com/kokkos/kokkos/pull/5047)
- ScatterView: Fix ScatterMin/ScatterMax to use proper atomics [\#5045](https://github.com/kokkos/kokkos/pull/5045)
- Fix calling `ZeroMemset` in `deep_copy` [\#5040](https://github.com/kokkos/kokkos/pull/5040)
- Make View self-assignment not produce double-free [\#5024](https://github.com/kokkos/kokkos/pull/5024)
- Guard against unrecognized pragma with intel compilers [\#5019](https://github.com/kokkos/kokkos/pull/5019)
- Fix race condition in `HIPParallelLaunch` [\#5008](https://github.com/kokkos/kokkos/pull/5008)
- KokkosP: Fix `device_id` in profiling [\#4997](https://github.com/kokkos/kokkos/pull/4997)
- Fix for `Kokkos::vector::insert` into empty vector with begin and end iterators [\#4988](https://github.com/kokkos/kokkos/pull/4988)
- Fix Core header files installation [\#4984](https://github.com/kokkos/kokkos/pull/4984)
- Fix bounds errors with `Kokkos::sort` [\#4980](https://github.com/kokkos/kokkos/pull/4980)
- Fixup let `RangePolicy::set_chunk_size` return a reference to self [\#4918](https://github.com/kokkos/kokkos/pull/4918)
- Fix allocating large Views [\#4907](https://github.com/kokkos/kokkos/pull/4907)
- Fix combined reductions with `Kokkos::View` [\#4896](https://github.com/kokkos/kokkos/pull/4896)
- Fixed `_CUDA_ARCH__` to `__CUDA_ARCH__` for CUDA LDG [\#4893](https://github.com/kokkos/kokkos/pull/4893)
- Fixup `View::access()` truncate parameter pack [\#4876](https://github.com/kokkos/kokkos/pull/4876)
- Fix `abort` with HIP backend for ROCm 5.0.2 and beyond [\#4873](https://github.com/kokkos/kokkos/pull/4873)
- Fix HIP version when printing the configuration [\#4872](https://github.com/kokkos/kokkos/pull/4872)
- Fix scratch lock array when using scratch level 1 [\#4871](https://github.com/kokkos/kokkos/pull/4871)
- Fix Makefile.kokkos to work with fujitsu compiler [\#4867](https://github.com/kokkos/kokkos/pull/4867)
- cmake: Correct link THREADS link option [\#4854](https://github.com/kokkos/kokkos/pull/4854)
- UniqueToken `impl_acquire` function should be device only [\#4819](https://github.com/kokkos/kokkos/pull/4819)
- Fix example calls to non existing static `print_configuration` [\#4806](https://github.com/kokkos/kokkos/pull/4806)
- Fix requests for large team scratch sizes [\#4728](https://github.com/kokkos/kokkos/pull/4728)
## [3.6.01](https://github.com/kokkos/kokkos/tree/3.6.01) (2022-05-23)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.6.00...3.6.01)
### Bug Fixes:
- Fix Threads: Fix serial resizing scratch space (3.6.01 cherry-pick) [\#5109](https://github.com/kokkos/kokkos/pull/5109)
- Fix ScatterMin/ScatterMax to use proper atomics (3.6.01 cherry-pick) [\#5046](https://github.com/kokkos/kokkos/pull/5046)
- Fix allocating large Views [\#4907](https://github.com/kokkos/kokkos/pull/4907)
- Fix bounds errors with Kokkos::sort [\#4980](https://github.com/kokkos/kokkos/pull/4980)
- Fix HIP version when printing the configuration [\#4872](https://github.com/kokkos/kokkos/pull/4872)
- Fixed `_CUDA_ARCH__` to `__CUDA_ARCH__` for CUDA LDG [\#4893](https://github.com/kokkos/kokkos/pull/4893)
- Fixed an incorrect struct initialization [\#5028](https://github.com/kokkos/kokkos/pull/5028)
- Fix race condition in `HIPParallelLaunch` [\#5008](https://github.com/kokkos/kokkos/pull/5008)
- Avoid deprecation warnings with `OpenMPExec::validate_partition` [\#4982](https://github.com/kokkos/kokkos/pull/4982)
- Make View self-assignment not produce double-free [\#5024](https://github.com/kokkos/kokkos/pull/5024)
## [3.6.00](https://github.com/kokkos/kokkos/tree/3.6.00) (2022-02-18)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.5.00...3.6.00)
### Features:
- Add C++ standard algorithms [\#4315](https://github.com/kokkos/kokkos/pull/4315)
- Implement `fill_random` for `DynRankView` [\#4763](https://github.com/kokkos/kokkos/pull/4763)
- Add `bhalf_t` [\#4543](https://github.com/kokkos/kokkos/pull/4543) [\#4653](https://github.com/kokkos/kokkos/pull/4653)
- Add mathematical constants [\#4519](https://github.com/kokkos/kokkos/pull/4519)
- Allow `Kokkos::{create_mirror*,resize,realloc}` to be used with `WithoutInitializing` [\#4486](https://github.com/kokkos/kokkos/pull/4486) [\#4337](https://github.com/kokkos/kokkos/pull/4337)
- Implement `KOKKOS_IF_ON_{HOST,DEVICE}` macros [\#4660](https://github.com/kokkos/kokkos/pull/4660)
- Allow setting the CMake language for Kokkos [\#4323](https://github.com/kokkos/kokkos/pull/4323)
#### Perf bug fix
- Desul: Add ScopeCaller [\#4690](https://github.com/kokkos/kokkos/pull/4690)
- Enable Desul atomics by default when using Makefiles [\#4606](https://github.com/kokkos/kokkos/pull/4606)
- Unique token improvement [\#4741](https://github.com/kokkos/kokkos/pull/4741) [\#4748](https://github.com/kokkos/kokkos/pull/4748)
#### Other improvements:
- Add math function long double overload on the host side [\#4712](https://github.com/kokkos/kokkos/pull/4712)
### Deprecations:
- Array reductions with pointer return types [\#4756](https://github.com/kokkos/kokkos/pull/4756)
- Deprecate `partition_master`, `validate_partition` [\#4737](https://github.com/kokkos/kokkos/pull/4737)
- Deprecate `Kokkos_ENABLE_PTHREAD` in favor of `Kokkos_ENABLE_THREADS` [\#4619](https://github.com/kokkos/kokkos/pull/4619) ** pair with use std::threads **
- Deprecate `log2(unsigned) -> int` (removing in next release) [\#4595](https://github.com/kokkos/kokkos/pull/4595)
- Deprecate `Kokkos::Impl::is_view` [\#4592](https://github.com/kokkos/kokkos/pull/4592)
- Deprecate `KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_*` macros and the `ActiveExecutionMemorySpace` alias [\#4668](https://github.com/kokkos/kokkos/issues/4668)
### Backends and Archs Enhancements:
#### SYCL:
- Update required SYCL compiler version [\#4749](https://github.com/kokkos/kokkos/pull/4749)
- Cap vector size to kernel maximum for SYCL [\#4704](https://github.com/kokkos/kokkos/pull/4704)
- Improve check for compatibility of vector size and subgroup size in SYCL [\#4579](https://github.com/kokkos/kokkos/pull/4579)
- Provide `chunk_size` for SYCL [\#4635](https://github.com/kokkos/kokkos/pull/4635)
- Use host-pinned memory for SYCL kernel memory [\#4627](https://github.com/kokkos/kokkos/pull/4627)
- Use shuffle-based algorithm for scalar reduction [\#4608](https://github.com/kokkos/kokkos/pull/4608)
- Implement pool of USM IndirectKernelMemory [\#4596](https://github.com/kokkos/kokkos/pull/4596)
- Provide valid default team size for SYCL [\#4481](https://github.com/kokkos/kokkos/pull/4481)
#### CUDA:
- Add checks for shmem usage in `parallel_reduce` [\#4548](https://github.com/kokkos/kokkos/pull/4548)
#### HIP:
- Add support for fp16 in the HIP backend [\#4688](https://github.com/kokkos/kokkos/pull/4688)
- Disable multiple kernel instantiations when using HIP (configure with `-DKokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS=ON` to use) [\#4644](https://github.com/kokkos/kokkos/pull/4644)
- Fix HIP scratch use per instance [\#4439](https://github.com/kokkos/kokkos/pull/4439)
- Change allocation header to 256B alignment for AMD VEGA architecture [\#4753](https://github.com/kokkos/kokkos/pull/4753)
- Add generic `KOKKOS_ARCH_VEGA` macro [\#4782](https://github.com/kokkos/kokkos/pull/4782)
- Require ROCm 4.5 [\#4689](https://github.com/kokkos/kokkos/pull/4689)
#### HPX:
- Adapt to HPX 1.7.0 which is now required [\#4241](https://github.com/kokkos/kokkos/pull/4241)
#### OpenMP:
- Fix thread deduction for OpenMP for `thread_count==0` [\#4541](https://github.com/kokkos/kokkos/pull/4541)
#### OpenMPTarget:
- Update memory space `size_type` to improve performance (`size_t -> unsigned`) [\#4779](https://github.com/kokkos/kokkos/pull/4779)
#### Other Improvements:
- Improve NVHPC support [\#4599](https://github.com/kokkos/kokkos/pull/4599)
- Add `Kokkos::Experimental::{min,max,minmax,clamp}` [\#4629](https://github.com/kokkos/kokkos/pull/4629) [\#4506](https://github.com/kokkos/kokkos/pull/4506)
- Use device type as template argument in Containers and Algorithms [\#4724](https://github.com/kokkos/kokkos/pull/4724) [\#4675](https://github.com/kokkos/kokkos/pull/4675)
- Implement `Kokkos::sort` with execution space [\#4490](https://github.com/kokkos/kokkos/pull/4490)
- `Kokkos::resize` always error out for mismatch in runtime rank [\#4681](https://github.com/kokkos/kokkos/pull/4681)
- Print current call stack when calling `Kokkos::abort()` from the host [\#4672](https://github.com/kokkos/kokkos/pull/4672) [\#4671](https://github.com/kokkos/kokkos/pull/4671)
- Detect mismatch of execution spaces in functors [\#4655](https://github.com/kokkos/kokkos/pull/4655)
- Improve view label access on host [\#4647](https://github.com/kokkos/kokkos/pull/4647)
- Error out for `const` scalar return type in reduction [\#4645](https://github.com/kokkos/kokkos/pull/4645)
- Don't allow calling `UnorderedMap::value_at` for a set [\#4639](https://github.com/kokkos/kokkos/pull/4639)
- Add `KOKKOS_COMPILER_NVHPC` macro, disable `quiet_NaN` and `signaling_NaN` [\#4586](https://github.com/kokkos/kokkos/pull/4586)
- Improve performance of `local_deep_copy` [\#4511](https://github.com/kokkos/kokkos/pull/4511)
- Improve performance when sorting integers [\#4464](https://github.com/kokkos/kokkos/pull/4464)
- Add missing numeric traits (`denorm_min`, `reciprocal_overflow_threshold`, `{quiet,signaling}_NaN`) and make them work on cv-qualified types [\#4466](https://github.com/kokkos/kokkos/pull/4466) [\#4415](https://github.com/kokkos/kokkos/pull/4415) [\#4473](https://github.com/kokkos/kokkos/pull/4473) [\#4443](https://github.com/kokkos/kokkos/pull/4443)
### Implemented enhancements BuildSystem
- Manually compute IntelLLVM compiler version for older CMake versions [\#4760](https://github.com/kokkos/kokkos/pull/4760)
- Add Xptxas without = to `nvcc_wrapper` [\#4646](https://github.com/kokkos/kokkos/pull/4646)
- Use external GoogleTest optionally [\#4563](https://github.com/kokkos/kokkos/pull/4563)
- Silence warnings about multiple optimization flags with `nvcc_wrapper` [\#4502](https://github.com/kokkos/kokkos/pull/4502)
- Use the same flags in Makefile.kokkos for POWER7/8/9 as for CMake [\#4483](https://github.com/kokkos/kokkos/pull/4483)
- Fix support for A64FX architecture [\#4745](https://github.com/kokkos/kokkos/pull/4745)
### Incompatibilities:
- Drop `KOKKOS_ARCH_HIP` macro when using generated GNU makefiles [\#4786](https://github.com/kokkos/kokkos/pull/4786)
- Remove gcc-toolchain auto add for clang in Makefile.kokkos [\#4762](https://github.com/kokkos/kokkos/pull/4762)
### Bug Fixes:
- Lock constant memory in Cuda/HIP kernel launch with a mutex (thread safety) [\#4525](https://github.com/kokkos/kokkos/pull/4525)
- Fix overflow for large requested scratch allocation [\#4551](https://github.com/kokkos/kokkos/pull/4551)
- Fix Windows build in mingw [\#4564](https://github.com/kokkos/kokkos/pull/4564)
- Fix `kokkos_launch_compiler`: escape `$` character [\#4769](https://github.com/kokkos/kokkos/pull/4769) [\#4703](https://github.com/kokkos/kokkos/pull/4703)
- Fix math functions with NVCC and GCC 5 as host compiler [\#4733](https://github.com/kokkos/kokkos/pull/4733)
- Fix shared build with Intel19 [\#4725](https://github.com/kokkos/kokkos/pull/4725)
- Do not install empty `desul/src/` directory [\#4714](https://github.com/kokkos/kokkos/pull/4714)
- Fix wrong `device_id` computation in `identifier_from_devid` (Profiling Interface) [\#4694](https://github.com/kokkos/kokkos/pull/4694)
- Fix a bug in CUDA scratch memory pool (abnormally high memory consumption) [\#4673](https://github.com/kokkos/kokkos/pull/4673)
- Remove eval of command args in `hpcbind` [\#4630](https://github.com/kokkos/kokkos/pull/4630)
- SYCL fix to run when no GPU is detected [\#4623](https://github.com/kokkos/kokkos/pull/4623)
- Fix `layout_strides::span` for rank-0 views [\#4605](https://github.com/kokkos/kokkos/pull/4605)
- Fix SYCL atomics for local memory [\#4585](https://github.com/kokkos/kokkos/pull/4585)
- Hotfix `mdrange_large_deep_copy` for SYCL [\#4581](https://github.com/kokkos/kokkos/pull/4581)
- Fix bug when sorting integer using the HIP backend [\#4570](https://github.com/kokkos/kokkos/pull/4570)
- Fix compilation error when using HIP with RDC [\#4553](https://github.com/kokkos/kokkos/pull/4553)
- `DynamicView`: Fix deallocation extent [\#4533](https://github.com/kokkos/kokkos/pull/4533)
- SYCL fix running parallel_reduce with TeamPolicy for large ranges [\#4532](https://github.com/kokkos/kokkos/pull/4532)
- Fix bash syntax error in `nvcc_wrapper` [\#4524](https://github.com/kokkos/kokkos/pull/4524)
- OpenMPTarget `team_policy` reduce fixes for `init/join` reductions [\#4521](https://github.com/kokkos/kokkos/pull/4521)
- Avoid hangs in the Threads backend [\#4499](https://github.com/kokkos/kokkos/pull/4499)
- OpenMPTarget fix reduction bug in `parallel_reduce` for `TeamPolicy` [\#4491](https://github.com/kokkos/kokkos/pull/4491)
- HIP fix scratch space per instance [\#4439](https://github.com/kokkos/kokkos/pull/4439)
- OpenMPTarget fix team scratch allocation [\#4431](https://github.com/kokkos/kokkos/pull/4431)
## [3.5.00](https://github.com/kokkos/kokkos/tree/3.5.00) (2021-10-19)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.01...3.5.00)
### Features:
- Add support for quad-precision math functions/traits [\#4098](https://github.com/kokkos/kokkos/pull/4098)
- Adding ExecutionSpace partitioning function [\#4096](https://github.com/kokkos/kokkos/pull/4096)
- Improve Python Interop Capabilities [\#4065](https://github.com/kokkos/kokkos/pull/4065)
- Add half_t Kokkos::rand specialization [\#3922](https://github.com/kokkos/kokkos/pull/3922)
- Add math special functions: erf, erfcx, expint1, Bessel functions, Hankel functions [\#3920](https://github.com/kokkos/kokkos/pull/3920)
- Add missing common mathematical functions [\#4043](https://github.com/kokkos/kokkos/pull/4043) [\#4036](https://github.com/kokkos/kokkos/pull/4036) [\#4034](https://github.com/kokkos/kokkos/pull/4034)
- Let the numeric traits be SFINAE-friendly [\#4038](https://github.com/kokkos/kokkos/pull/4038)
- Add Desul atomics - enabling memory-order and memory-scope parameters [\#3247](https://github.com/kokkos/kokkos/pull/3247)
- Add detection idiom from the C++ standard library extension version 2 [\#3980](https://github.com/kokkos/kokkos/pull/3980)
- Fence Profiling Support in all backends [\#3966](https://github.com/kokkos/kokkos/pull/3966) [\#4304](https://github.com/kokkos/kokkos/pull/4304) [\#4258](https://github.com/kokkos/kokkos/pull/4258) [\#4232](https://github.com/kokkos/kokkos/pull/4232)
- Significant SYCL enhancements (see below)
### Deprecations:
- Deprecate CUDA_SAFE_CALL and HIP_SAFE_CALL [\#4249](https://github.com/kokkos/kokkos/pull/4249)
- Deprecate Kokkos::Impl::Timer (Kokkos::Timer has been available for a long time) [\#4201](https://github.com/kokkos/kokkos/pull/4201)
- Deprecate Experimental::MasterLock [\#4094](https://github.com/kokkos/kokkos/pull/4094)
- Deprecate Kokkos_TaskPolicy.hpp (headers got reorganized, doesn't remove functionality) [\#4011](https://github.com/kokkos/kokkos/pull/4011)
- Deprecate backward compatibility features [\#3978](https://github.com/kokkos/kokkos/pull/3978)
- Update and deprecate is_space::host_memory/execution/mirror_space [\#3973](https://github.com/kokkos/kokkos/pull/3973)
### Backends and Archs Enhancements:
- Enabling constbitset constructors in kernels [\#4296](https://github.com/kokkos/kokkos/pull/4296)
- Use ZeroMemset in View constructor to improve performance [\#4226](https://github.com/kokkos/kokkos/pull/4226)
- Use memset in deep_copy [\#3944](https://github.com/kokkos/kokkos/pull/3944)
- Add missing fence() calls in resize(View) that effectively do deep_copy(resized, orig) [\#4212](https://github.com/kokkos/kokkos/pull/4212)
- Avoid allocations in resize and realloc [\#4207](https://github.com/kokkos/kokkos/pull/4207)
- StaticCsrGraph: use device type instead of execution space to construct views [\#3991](https://github.com/kokkos/kokkos/pull/3991)
- Consider std::sort when view is accessible from host [\#3929](https://github.com/kokkos/kokkos/pull/3929)
- Fix CPP20 warnings except for volatile [\#4312](https://github.com/kokkos/kokkos/pull/4312)
#### SYCL:
- Introduce SYCLHostUSMSpace [\#4268](https://github.com/kokkos/kokkos/pull/4268)
- Implement SYCL TeamPolicy for vector_size > 1 [\#4183](https://github.com/kokkos/kokkos/pull/4183)
- Enable 64bit ranges for SYCL [\#4211](https://github.com/kokkos/kokkos/pull/4211)
- Don't print SYCL device info in execution space initialization [\#4168](https://github.com/kokkos/kokkos/pull/4168)
- Improve SYCL MDRangePolicy performance [\#4161](https://github.com/kokkos/kokkos/pull/4161)
- Use sub_groups in SYCL parallel_scan [\#4147](https://github.com/kokkos/kokkos/pull/4147)
- Implement subgroup reduction for SYCL RangePolicy parallel_reduce [\#3940](https://github.com/kokkos/kokkos/pull/3940)
- Use DPC++ broadcast extension in SYCL team_broadcast [\#4103](https://github.com/kokkos/kokkos/pull/4103)
- Only fence in SYCL parallel_reduce for non-device-accessible result_ptr [\#4089](https://github.com/kokkos/kokkos/pull/4089)
- Improve fencing behavior in SYCL backend [\#4088](https://github.com/kokkos/kokkos/pull/4088)
- Fence all registered SYCL queues before deallocating memory [\#4086](https://github.com/kokkos/kokkos/pull/4086)
- Implement SYCL::print_configuration [\#3992](https://github.com/kokkos/kokkos/pull/3992)
- Reuse scratch memory in parallel_scan and TeamPolicy (decreases memory footprint) [\#3899](https://github.com/kokkos/kokkos/pull/3899) [\#3889](https://github.com/kokkos/kokkos/pull/3889)
#### CUDA:
- Cuda improve heuristic for blocksize [\#4271](https://github.com/kokkos/kokkos/pull/4271)
- Don't use [[deprecated]] for nvcc [\#4229](https://github.com/kokkos/kokkos/pull/4229)
- Improve error message for NVHPC as host compiler [\#4227](https://github.com/kokkos/kokkos/pull/4227)
- Update support for cuda reductions to work with types < 4bytes [\#4156](https://github.com/kokkos/kokkos/pull/4156)
- Fix incompatible team size deduction in rare cases parallel_reduce [\#4142](https://github.com/kokkos/kokkos/pull/4142)
- Remove UVM usage in DynamicView [\#4129](https://github.com/kokkos/kokkos/pull/4129)
- Remove dependency between core and containers [\#4114](https://github.com/kokkos/kokkos/pull/4114)
- Adding opt-in CudaMallocSync support when using CUDA version >= 11.2 [\#4026](https://github.com/kokkos/kokkos/pull/4026) [\#4233](https://github.com/kokkos/kokkos/pull/4233)
- Fix a potential race condition in the CUDA backend [\#3999](https://github.com/kokkos/kokkos/pull/3999)
#### HIP:
- Implement new blocksize deduction method for HIP Backend [\#3953](https://github.com/kokkos/kokkos/pull/3953)
- Add multiple LaunchMechanism [\#3820](https://github.com/kokkos/kokkos/pull/3820)
- Make HIP backend thread-safe [\#4170](https://github.com/kokkos/kokkos/pull/4170)
#### Serial:
- Refactor Serial backend and fix thread-safety issue [\#4053](https://github.com/kokkos/kokkos/pull/4053)
#### OpenMPTarget:
- OpenMPTarget: support array reductions in RangePolicy [\#4040](https://github.com/kokkos/kokkos/pull/4040)
- OpenMPTarget: add MDRange parallel_reduce [\#4032](https://github.com/kokkos/kokkos/pull/4032)
- OpenMPTarget: Fix bug in the case of a reducer. [\#4044](https://github.com/kokkos/kokkos/pull/4044)
- OpenMPTarget: verify process fix [\#4041](https://github.com/kokkos/kokkos/pull/4041)
### Implemented enhancements BuildSystem
#### Important BuildSystem Updates:
- Use hipcc architecture autodetection when Kokkos_ARCH is not set [\#3941](https://github.com/kokkos/kokkos/pull/3941)
- Introduce Kokkos_ENABLE_DEPRECATION_WARNINGS and remove deprecated code with Kokkos_ENABLE_DEPRECATED_CODE_3 [\#4106](https://github.com/kokkos/kokkos/pull/4106) [\#3855](https://github.com/kokkos/kokkos/pull/3855)
#### Other Improvements:
- Add allow-unsupported-compiler flag to nvcc-wrapper [\#4298](https://github.com/kokkos/kokkos/pull/4298)
- nvcc_wrapper: fix errors in argument handling [\#3993](https://github.com/kokkos/kokkos/pull/3993)
- Adds support for -time= and -time in nvcc_wrapper [\#4015](https://github.com/kokkos/kokkos/pull/4015)
- nvcc_wrapper: suppress duplicates of GPU architecture and RDC flags [\#3968](https://github.com/kokkos/kokkos/pull/3968)
- Fix TMPDIR support in nvcc_wrapper [\#3792](https://github.com/kokkos/kokkos/pull/3792)
- NVHPC: update PGI compiler arch flags [\#4133](https://github.com/kokkos/kokkos/pull/4133)
- Replace PGI with NVHPC (works for both) [\#4196](https://github.com/kokkos/kokkos/pull/4196)
- Make sure that KOKKOS_CXX_HOST_COMPILER_ID is defined [\#4235](https://github.com/kokkos/kokkos/pull/4235)
- Add options to Makefile builds for deprecated code and warnings [\#4215](https://github.com/kokkos/kokkos/pull/4215)
- Use KOKKOS_CXX_HOST_COMPILER_ID for identifying CPU arch flags [\#4199](https://github.com/kokkos/kokkos/pull/4199)
- Added support for Cray Clang to Makefile.kokkos [\#4176](https://github.com/kokkos/kokkos/pull/4176)
- Add XLClang as compiler [\#4120](https://github.com/kokkos/kokkos/pull/4120)
- Keep quoted compiler flags when passing to Trilinos [\#3987](https://github.com/kokkos/kokkos/pull/3987)
- Add support for AMD Zen3 CPU architecture [\#3972](https://github.com/kokkos/kokkos/pull/3972)
- Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
- Add cppcoreguidelines-pro-type-cstyle-cast to clang-tidy [\#3522](https://github.com/kokkos/kokkos/pull/3522)
- Add sve bit size definition for A64FX [\#3947](https://github.com/kokkos/kokkos/pull/3947) [\#3946](https://github.com/kokkos/kokkos/pull/3946)
- Remove KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES [\#4150](https://github.com/kokkos/kokkos/pull/4150)
### Other Changes:
#### Tool Enhancements:
- Retrieve original value from a point in a MultidimensionalSparseTuningProblem [\#3977](https://github.com/kokkos/kokkos/pull/3977)
- Allow extension of built-in tuners with additional tuning axes [\#3961](https://github.com/kokkos/kokkos/pull/3961)
- Added a categorical tuner [\#3955](https://github.com/kokkos/kokkos/pull/3955)
#### Miscellaneous:
- hpcbind: Use double quotes around $@ when invoking user command [\#4284](https://github.com/kokkos/kokkos/pull/4284)
- Add file and line to error message [\#3985](https://github.com/kokkos/kokkos/pull/3985)
- Fix compiler warnings when compiling with nvc++ [\#4198](https://github.com/kokkos/kokkos/pull/4198)
- Add OpenMPTarget CI build on AMD GPUs [\#4055](https://github.com/kokkos/kokkos/pull/4055)
- CI: icpx is now part of intel container [\#4002](https://github.com/kokkos/kokkos/pull/4002)
### Incompatibilities:
- Remove pre CUDA 9 KOKKOS_IMPL_CUDA_* macros [\#4138](https://github.com/kokkos/kokkos/pull/4138)
### Bug Fixes:
- UnorderedMap::clear() should zero the size() [\#4130](https://github.com/kokkos/kokkos/pull/4130)
- Add memory fence for HostSharedPtr::cleanup() [\#4144](https://github.com/kokkos/kokkos/pull/4144)
- SYCL: Fix race conditions in TeamPolicy::parallel_reduce [\#4418](https://github.com/kokkos/kokkos/pull/4418)
- Adding missing memory fence to serial exec space fence. [\#4292](https://github.com/kokkos/kokkos/pull/4292)
- Fix using external SYCL queues in tests [\#4291](https://github.com/kokkos/kokkos/pull/4291)
- Fix digits10 bug [\#4281](https://github.com/kokkos/kokkos/pull/4281)
- Fixes constexpr errors with frounding-math on gcc < 10. [\#4278](https://github.com/kokkos/kokkos/pull/4278)
- Fix compiler flags for PGI/NVHPC [\#4264](https://github.com/kokkos/kokkos/pull/4264)
- Fix Zen2/3 also implying Zen Arch with Makefiles [\#4260](https://github.com/kokkos/kokkos/pull/4260)
- Kokkos_Cuda.hpp: Fix shadow warning with cuda/11.0 [\#4252](https://github.com/kokkos/kokkos/pull/4252)
- Fix issue w/ static initialization of function attributes [\#4242](https://github.com/kokkos/kokkos/pull/4242)
- Disable long double hypot test on Power systems [\#4221](https://github.com/kokkos/kokkos/pull/4221)
- Fix false sharing in random pool [\#4218](https://github.com/kokkos/kokkos/pull/4218)
- Fix a missing memory_fence for debug shared alloc code [\#4216](https://github.com/kokkos/kokkos/pull/4216)
- Fix two xl issues [\#4179](https://github.com/kokkos/kokkos/pull/4179)
- Makefile.kokkos: fix (standard_in) 1: syntax error [\#4173](https://github.com/kokkos/kokkos/pull/4173)
- Fixes for query_device example [\#4172](https://github.com/kokkos/kokkos/pull/4172)
- Fix a bug when using HIP atomic with Kokkos::Complex [\#4159](https://github.com/kokkos/kokkos/pull/4159)
- Fix mistaken logic in pthread creation [\#4157](https://github.com/kokkos/kokkos/pull/4157)
- Define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION when requesting Kokkos_ENABLE_AGGRESSIVE_VECTORIZATION=ON [\#4107](https://github.com/kokkos/kokkos/pull/4107)
- Fix compilation with latest MSVC version [\#4102](https://github.com/kokkos/kokkos/pull/4102)
- Fix incorrect macro definitions when compiling with Intel compiler on Windows [\#4087](https://github.com/kokkos/kokkos/pull/4087)
- Fixup global buffer overflow in hand rolled string manipulation [\#4070](https://github.com/kokkos/kokkos/pull/4070)
- Fixup heap buffer overflow in cmd line args parsing unit tests [\#4069](https://github.com/kokkos/kokkos/pull/4069)
- Only add quotes in compiler flags for Trilinos if necessary [\#4067](https://github.com/kokkos/kokkos/pull/4067)
- Fixed invocation of tools init callbacks [\#4061](https://github.com/kokkos/kokkos/pull/4061)
- Work around SYCL JIT compiler issues with static variables [\#4013](https://github.com/kokkos/kokkos/pull/4013)
- Fix TestDetectionIdiom.cpp test inclusion for Trilinos/TriBITS [\#4010](https://github.com/kokkos/kokkos/pull/4010)
- Fixup allocation headers with OpenMPTarget backend [\#4003](https://github.com/kokkos/kokkos/pull/4003)
- Add missing specialization for OMPT to Kokkos Random [\#3967](https://github.com/kokkos/kokkos/pull/3967)
- Disable hypot long double test on power arches [\#3962](https://github.com/kokkos/kokkos/pull/3962)
- Use different EBO workaround for MSVC (rebased) [\#3924](https://github.com/kokkos/kokkos/pull/3924)
- Fix SYCL Kokkos::Profiling::(de)allocateData calls [\#3928](https://github.com/kokkos/kokkos/pull/3928)
## [3.4.01](https://github.com/kokkos/kokkos/tree/3.4.01) (2021-05-19)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.4.00...3.4.01)
**Bug Fixes:**
- Windows: Remove atomic_compare_exchange_strong overload conflicts with Windows [\#4024](https://github.com/kokkos/kokkos/pull/4024)
- OpenMPTarget: Fixup allocation headers with OpenMPTarget backend [\#4020](https://github.com/kokkos/kokkos/pull/4020)
- OpenMPTarget: Add missing specialization for OMPT to Kokkos Random [\#4022](https://github.com/kokkos/kokkos/pull/4022)
- AMD: Add support for AMD Zen3 CPU architecture [\#4021](https://github.com/kokkos/kokkos/pull/4021)
- SYCL: Implement SYCL::print_configuration [\#4012](https://github.com/kokkos/kokkos/pull/4012)
- Containers: staticcsrgraph: use device type instead of execution space to construct views [\#3998](https://github.com/kokkos/kokkos/pull/3998)
- nvcc_wrapper: fix errors in argument handling, suppress duplicates of GPU architecture and RDC flags [\#4006](https://github.com/kokkos/kokkos/pull/4006)
- CI: Add icpx testing to intel container [\#4004](https://github.com/kokkos/kokkos/pull/4004)
- CMake/TRIBITS: Keep quoted compiler flags when passing to Trilinos [\#4007](https://github.com/kokkos/kokkos/pull/4007)
- CMake: Rename IntelClang to IntelLLVM [\#3945](https://github.com/kokkos/kokkos/pull/3945)
## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00)
**Highlights:**
- SYCL Backend Almost Feature Complete
- OpenMPTarget Backend Almost Feature Complete
- Performance Improvements for HIP backend
- Require CMake 3.16 or newer
- Tool Callback Interface Enhancements
- cmath wrapper functions available now in Kokkos::Experimental
**Features:**
- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861)
- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849)
- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845)
- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833)
- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823)
- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822)
- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818)
- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815)
- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801)
- Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786)
- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783)
- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781)
- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655)
- SYCL basic TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654)
- OpenMPTarget: scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611)
**Implemented enhancements Backends and Archs:**
- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918)
- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916)
- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908)
- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902)
- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898)
- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893)
- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866)
- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873)
- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826)
- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824)
- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811)
- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808)
- Remove ExtendedReferenceWrapper for SYCL parallel_reduce [\#3802](https://github.com/kokkos/kokkos/pull/3802)
- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777)
- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776)
- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767)
- SYCL TeamPolicy scratch memory alternative [\#3763](https://github.com/kokkos/kokkos/pull/3763)
- Alternative implementation for SYCL TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759)
- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754)
- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746)
- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733)
- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732)
- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703)
- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697)
- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677)
- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671)
- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626)
- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564)
- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511)
**Implemented enhancements BuildSystem:**
- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912)
- Fix compiling SYCL with OpenMP [\#3874](https://github.com/kokkos/kokkos/pull/3874)
- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869)
- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862)
- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846)
- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843)
- Enable HIP with Cray Clang [\#3842](https://github.com/kokkos/kokkos/pull/3842)
- Add an option to disable header self containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834)
- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809)
- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779)
- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778)
- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774)
- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772)
- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770)
- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753)
- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741)
- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735)
- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722)
- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713)
- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706)
- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699)
- Add MI-100 arch string [\#3698](https://github.com/kokkos/kokkos/pull/3698)
- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679)
- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844)
**Implemented enhancements Tools:**
- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860)
- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812)
- Enable launch latency simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721)
- Added metadata callback to tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711)
- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688)
- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627)
- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481)
**Implemented enhancements Other:**
- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921)
- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909)
- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904)
- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891)
- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890)
- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880)
- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851)
- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857)
- More extensive disentangling of Policy Traits [\#3829](https://github.com/kokkos/kokkos/pull/3829)
- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825)
- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810)
- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729)
- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714)
- Major duplicate code removal in SharedAllocationRecord specializations [\#3658](https://github.com/kokkos/kokkos/pull/3658)
**Fixed bugs:**
- Provide forward declarations in Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911)
- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882)
- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881)
- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868)
- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866)
- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859)
- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856)
- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854)
- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852)
- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848)
- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803)
- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800)
- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797)
- Workaround race condition in SYCL parallel_reduce [\#3782](https://github.com/kokkos/kokkos/pull/3782)
- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780)
- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755)
- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752)
- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751)
- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748)
- Fix parsing bug for number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724)
- Avoid more warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719)
- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718)
- Fix cuda cache config not being set correctly [\#3712](https://github.com/kokkos/kokkos/pull/3712)
- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701)
- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696)
**Incompatibilities:**
- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919)
- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896)
- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884)
- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883)
- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863)
- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847)
- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836)
- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813)
- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793)
- Remove superfluous FunctorFinal specialization [\#3788](https://github.com/kokkos/kokkos/pull/3788)
- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678)
- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533)
**Enabled tests:**
- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903)
- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900)
- Restrict MDRange Policy tests for Intel GPUs [\#3853](https://github.com/kokkos/kokkos/pull/3853)
- Disable death tests for rawhide [\#3844](https://github.com/kokkos/kokkos/pull/3844)
- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839)
- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830)
- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744)
- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742)
- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692)
- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691)
## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01)
**Bug Fixes:**
- Fix severe performance bug in DualView which added memcpys for sync and modify [\#3693](https://github.com/kokkos/kokkos/issues/3693)
- Fix performance bug in CUDA backend, where the CUDA cache config was not set correctly.
## [3.3.00](https://github.com/kokkos/kokkos/tree/3.3.00) (2020-12-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.01...3.3.00)
**Features:**
- Require C++14 as minimum C++ standard. C++17 and C++20 are supported too.
- HIP backend is nearly feature complete. Kokkos Dynamic Task Graphs are missing.
- Major update for OpenMPTarget: many capabilities now work. For details contact us.
- Added DPC++/SYCL backend: primary capabilities are working.
- Added Kokkos Graph API analogous to CUDA Graphs.
- Added parallel_scan support with TeamThreadRange [\#3536](https://github.com/kokkos/kokkos/pull/3536)
- Added Logical Memory Spaces [\#3546](https://github.com/kokkos/kokkos/pull/3546)
- Added initial half precision support [\#3439](https://github.com/kokkos/kokkos/pull/3439)
- Experimental feature: control cuda occupancy [\#3379](https://github.com/kokkos/kokkos/pull/3379)
**Implemented enhancements Backends and Archs:**
- Add a64fx and fujitsu Compiler support [\#3614](https://github.com/kokkos/kokkos/pull/3614)
- Adding support for AMD gfx908 architecture [\#3375](https://github.com/kokkos/kokkos/pull/3375)
- SYCL parallel\_for MDRangePolicy [\#3583](https://github.com/kokkos/kokkos/pull/3583)
- SYCL add parallel\_scan [\#3577](https://github.com/kokkos/kokkos/pull/3577)
- SYCL custom reductions [\#3544](https://github.com/kokkos/kokkos/pull/3544)
- SYCL Enable container unit tests [\#3550](https://github.com/kokkos/kokkos/pull/3550)
- SYCL feature level 5 [\#3480](https://github.com/kokkos/kokkos/pull/3480)
- SYCL Feature level 4 (parallel\_for) [\#3474](https://github.com/kokkos/kokkos/pull/3474)
- SYCL feature level 3 [\#3451](https://github.com/kokkos/kokkos/pull/3451)
- SYCL feature level 2 [\#3447](https://github.com/kokkos/kokkos/pull/3447)
- OpenMPTarget: Hierarchical reduction for + operator on scalars [\#3504](https://github.com/kokkos/kokkos/pull/3504)
- OpenMPTarget hierarchical [\#3411](https://github.com/kokkos/kokkos/pull/3411)
- HIP Add Impl::atomic\_[store,load] [\#3440](https://github.com/kokkos/kokkos/pull/3440)
- HIP enable global lock arrays [\#3418](https://github.com/kokkos/kokkos/pull/3418)
- HIP Implement multiple occupancy paths for various HIP kernel launchers [\#3366](https://github.com/kokkos/kokkos/pull/3366)
**Implemented enhancements Policies:**
- MDRangePolicy: Let it be semiregular [\#3494](https://github.com/kokkos/kokkos/pull/3494)
- MDRangePolicy: Check narrowing conversion in construction [\#3527](https://github.com/kokkos/kokkos/pull/3527)
- MDRangePolicy: CombinedReducers support [\#3395](https://github.com/kokkos/kokkos/pull/3395)
- Kokkos Graph: Interface and Default Implementation [\#3362](https://github.com/kokkos/kokkos/pull/3362)
- Kokkos Graph: add Cuda Graph implementation [\#3369](https://github.com/kokkos/kokkos/pull/3369)
- TeamPolicy: implemented autotuning of team sizes and vector lengths [\#3206](https://github.com/kokkos/kokkos/pull/3206)
- RangePolicy: Initialize all data members in default constructor [\#3509](https://github.com/kokkos/kokkos/pull/3509)
**Implemented enhancements BuildSystem:**
- Auto-generate core test files for all backends [\#3488](https://github.com/kokkos/kokkos/pull/3488)
- Avoid rewriting test files when calling cmake [\#3548](https://github.com/kokkos/kokkos/pull/3548)
- RULE\_LAUNCH\_COMPILE and RULE\_LAUNCH\_LINK system for nvcc\_wrapper [\#3136](https://github.com/kokkos/kokkos/pull/3136)
- Adding -include as a known argument to nvcc\_wrapper [\#3434](https://github.com/kokkos/kokkos/pull/3434)
- Install hpcbind script [\#3402](https://github.com/kokkos/kokkos/pull/3402)
- cmake/kokkos\_tribits.cmake: add parsing for args [\#3457](https://github.com/kokkos/kokkos/pull/3457)
**Implemented enhancements Tools:**
- Changed namespacing of Kokkos::Tools::Impl::Impl::tune\_policy [\#3455](https://github.com/kokkos/kokkos/pull/3455)
- Delegate to an impl allocate/deallocate method to allow specifying a SpaceHandle for MemorySpaces [\#3530](https://github.com/kokkos/kokkos/pull/3530)
- Use the Kokkos Profiling interface rather than the Impl interface [\#3518](https://github.com/kokkos/kokkos/pull/3518)
- Runtime option for tuning [\#3459](https://github.com/kokkos/kokkos/pull/3459)
- Dual View Tool Events [\#3326](https://github.com/kokkos/kokkos/pull/3326)
**Implemented enhancements Other:**
- Abort on errors instead of just printing [\#3528](https://github.com/kokkos/kokkos/pull/3528)
- Enable C++14 macros unconditionally [\#3449](https://github.com/kokkos/kokkos/pull/3449)
- Make ViewMapping trivially copyable [\#3436](https://github.com/kokkos/kokkos/pull/3436)
- Rename struct ViewMapping to class [\#3435](https://github.com/kokkos/kokkos/pull/3435)
- Replace enums in Kokkos\_ViewMapping.hpp (removes -Wextra) [\#3422](https://github.com/kokkos/kokkos/pull/3422)
- Use bool for enums representing bools [\#3416](https://github.com/kokkos/kokkos/pull/3416)
- Fence active instead of default execution space instances [\#3388](https://github.com/kokkos/kokkos/pull/3388)
- Refactor parallel\_reduce fence usage [\#3359](https://github.com/kokkos/kokkos/pull/3359)
- Moved Space EBO helpers to Kokkos\_EBO [\#3357](https://github.com/kokkos/kokkos/pull/3357)
- Add remove\_cvref type trait [\#3340](https://github.com/kokkos/kokkos/pull/3340)
- Adding identity type traits and update definition of identity\_t alias [\#3339](https://github.com/kokkos/kokkos/pull/3339)
- Add is\_specialization\_of type trait [\#3338](https://github.com/kokkos/kokkos/pull/3338)
- Make ScratchMemorySpace semi-regular [\#3309](https://github.com/kokkos/kokkos/pull/3309)
- Optimize min/max atomics with early exit on no-op case [\#3265](https://github.com/kokkos/kokkos/pull/3265)
- Refactor Backend Development [\#2941](https://github.com/kokkos/kokkos/pull/2941)
**Fixed bugs:**
- Fixup MDRangePolicy construction from Kokkos arrays [\#3591](https://github.com/kokkos/kokkos/pull/3591)
- Add atomic functions for unsigned long long using gcc built-in [\#3588](https://github.com/kokkos/kokkos/pull/3588)
- Fixup silent pointless comparison with zero in checked\_narrow\_cast (compiler workaround) [\#3566](https://github.com/kokkos/kokkos/pull/3566)
- Fixes for ROCm 3.9 [\#3565](https://github.com/kokkos/kokkos/pull/3565)
- Fix windows build issues which crept in for the CUDA build [\#3532](https://github.com/kokkos/kokkos/pull/3532)
- HIP Fix atomics of large data types and clean up lock arrays [\#3529](https://github.com/kokkos/kokkos/pull/3529)
- Pthreads fix exception resulting from 0 grain size [\#3510](https://github.com/kokkos/kokkos/pull/3510)
- Fixup do not require atomic operation to be default constructible [\#3503](https://github.com/kokkos/kokkos/pull/3503)
- Fix race condition in HIP backend [\#3467](https://github.com/kokkos/kokkos/pull/3467)
- Replace KOKKOS\_DEBUG with KOKKOS\_ENABLE\_DEBUG [\#3458](https://github.com/kokkos/kokkos/pull/3458)
- Fix multi-stream team scratch space definition for HIP [\#3398](https://github.com/kokkos/kokkos/pull/3398)
- HIP fix template deduction [\#3393](https://github.com/kokkos/kokkos/pull/3393)
- Fix compiling with HIP and C++17 [\#3390](https://github.com/kokkos/kokkos/pull/3390)
- Fix sigFPE in HIP blocksize deduction [\#3378](https://github.com/kokkos/kokkos/pull/3378)
- Type alias change: replace CS with CTS to avoid conflicts with NVSHMEM [\#3348](https://github.com/kokkos/kokkos/pull/3348)
- Clang compilation of CUDA backend on Windows [\#3345](https://github.com/kokkos/kokkos/pull/3345)
- Fix HBW support [\#3343](https://github.com/kokkos/kokkos/pull/3343)
- Added missing fences to unique token [\#3260](https://github.com/kokkos/kokkos/pull/3260)
**Incompatibilities:**
- Remove unused utilities (forward, move, and expand\_variadic) from Kokkos::Impl [\#3535](https://github.com/kokkos/kokkos/pull/3535)
- Remove unused traits [\#3534](https://github.com/kokkos/kokkos/pull/3534)
- HIP: Remove old HCC code [\#3301](https://github.com/kokkos/kokkos/pull/3301)
- Prepare for deprecation of ViewAllocateWithoutInitializing [\#3264](https://github.com/kokkos/kokkos/pull/3264)
- Remove ROCm backend [\#3148](https://github.com/kokkos/kokkos/pull/3148)
## [3.2.01](https://github.com/kokkos/kokkos/tree/3.2.01) (2020-11-17)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.2.00...3.2.01)
**Fixed bugs:**
- Disallow KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE in shared library builds [\#3332](https://github.com/kokkos/kokkos/pull/3332)
- Do not install libprinter-tool when testing is enabled [\#3313](https://github.com/kokkos/kokkos/pull/3313)
- Fix restrict/alignment following refactor [\#3373](https://github.com/kokkos/kokkos/pull/3373)
- Intel fix: workaround compiler issue with using statement [\#3383](https://github.com/kokkos/kokkos/pull/3383)
- Fix zero-length reductions [\#3364](https://github.com/kokkos/kokkos/pull/3364)
- Pthread zero-length reduction fix [\#3452](https://github.com/kokkos/kokkos/pull/3452)
- HPX zero-length reduction fix [\#3470](https://github.com/kokkos/kokkos/pull/3470)
- cuda/9.2 zero-length reduction fix [\#3580](https://github.com/kokkos/kokkos/pull/3580)
- Fix multi-stream scratch [\#3269](https://github.com/kokkos/kokkos/pull/3269)
- Guard KOKKOS_ALL_COMPILE_OPTIONS if Cuda is not enabled [\#3387](https://github.com/kokkos/kokkos/pull/3387)
- Do not include link flags for Fortran linkage [\#3384](https://github.com/kokkos/kokkos/pull/3384)
- Fix NVIDIA GPU arch macro with autodetection [\#3473](https://github.com/kokkos/kokkos/pull/3473)
- Fix libdl/test issues with Trilinos [\#3543](https://github.com/kokkos/kokkos/pull/3543)
- Register Pthread as Tribits option to be enabled with Trilinos [\#3558](https://github.com/kokkos/kokkos/pull/3558)
**Implemented enhancements:**
- Separate Cuda timing-based tests into their own executable [\#3407](https://github.com/kokkos/kokkos/pull/3407)
## [3.2.00](https://github.com/kokkos/kokkos/tree/3.2.00) (2020-08-19)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.01...3.2.00)
**Implemented enhancements:**
- HIP:Enable stream in HIP [\#3163](https://github.com/kokkos/kokkos/issues/3163)
- HIP:Add support for shuffle reduction for the HIP backend [\#3154](https://github.com/kokkos/kokkos/issues/3154)
- HIP:Add implementations of missing HIPHostPinnedSpace methods for LAMMPS [\#3137](https://github.com/kokkos/kokkos/issues/3137)
- HIP:Require HIP 3.5.0 or higher [\#3099](https://github.com/kokkos/kokkos/issues/3099)
- HIP:WorkGraphPolicy for HIP [\#3096](https://github.com/kokkos/kokkos/issues/3096)
- OpenMPTarget: Significant update to the new experimental backend. Requires C++17, works on Intel GPUs, reference counting fixes. [\#3169](https://github.com/kokkos/kokkos/issues/3169)
- Windows Cuda support [\#3018](https://github.com/kokkos/kokkos/issues/3018)
- Pass `-Wext-lambda-captures-this` to NVCC when support for `__host__ __device__` lambda is enabled from CUDA 11 [\#3241](https://github.com/kokkos/kokkos/issues/3241)
- Use explicit staging buffer for constant memory kernel launches and cleanup host/device synchronization [\#3234](https://github.com/kokkos/kokkos/issues/3234)
- Various fixup to policies including making TeamPolicy default constructible and making RangePolicy and TeamPolicy assignable: [\#3202](https://github.com/kokkos/kokkos/issues/3202) , [\#3203](https://github.com/kokkos/kokkos/issues/3203) , [\#3196](https://github.com/kokkos/kokkos/issues/3196)
- Annotations for `DefaultExecutionSpace` and `DefaultHostExecutionSpace` to use in static analysis [\#3189](https://github.com/kokkos/kokkos/issues/3189)
- Add documentation on using Spack to install Kokkos and developing packages that depend on Kokkos [\#3187](https://github.com/kokkos/kokkos/issues/3187)
- Add OpenMPTarget backend flags for NVC++ compiler [\#3185](https://github.com/kokkos/kokkos/issues/3185)
- Move deep\_copy/create\_mirror\_view on Experimental::OffsetView into Kokkos:: namespace [\#3166](https://github.com/kokkos/kokkos/issues/3166)
- Allow for larger block size in HIP [\#3165](https://github.com/kokkos/kokkos/issues/3165)
- View: Added names of Views to the different View initialize/free kernels [\#3159](https://github.com/kokkos/kokkos/issues/3159)
- Cuda: Caching cudaFunctorAttributes and whether L1/Shmem prefer was set [\#3151](https://github.com/kokkos/kokkos/issues/3151)
- BuildSystem: Improved performance in default configuration by defaulting to Release build [\#3131](https://github.com/kokkos/kokkos/issues/3131)
- Cuda: Update CUDA occupancy calculation [\#3124](https://github.com/kokkos/kokkos/issues/3124)
- Vector: Adding data() to Vector [\#3123](https://github.com/kokkos/kokkos/issues/3123)
- BuildSystem: Add CUDA Ampere configuration support [\#3122](https://github.com/kokkos/kokkos/issues/3122)
- General: Apply [[noreturn]] to Kokkos::abort when applicable [\#3106](https://github.com/kokkos/kokkos/issues/3106)
- TeamPolicy: Validate storage level argument passed to TeamPolicy::set\_scratch\_size() [\#3098](https://github.com/kokkos/kokkos/issues/3098)
- BuildSystem: Make kokkos\_has\_string() function in Makefile.kokkos case insensitive [\#3091](https://github.com/kokkos/kokkos/issues/3091)
- Modify KOKKOS\_FUNCTION macro for clang-tidy analysis [\#3087](https://github.com/kokkos/kokkos/issues/3087)
- Move allocation profiling to allocate/deallocate calls [\#3084](https://github.com/kokkos/kokkos/issues/3084)
- BuildSystem: FATAL\_ERROR when attempting in-source build [\#3082](https://github.com/kokkos/kokkos/issues/3082)
- Change enums in ScatterView to types [\#3076](https://github.com/kokkos/kokkos/issues/3076)
- HIP: Changes for new compiler/runtime [\#3067](https://github.com/kokkos/kokkos/issues/3067)
- Extract and use get\_gpu [\#3061](https://github.com/kokkos/kokkos/issues/3061) , [\#3048](https://github.com/kokkos/kokkos/issues/3048)
- Add is\_allocated to View-like containers [\#3059](https://github.com/kokkos/kokkos/issues/3059)
- Combined reducers for scalar references [\#3052](https://github.com/kokkos/kokkos/issues/3052)
- Add configurable capacity for UniqueToken [\#3051](https://github.com/kokkos/kokkos/issues/3051)
- Add installation testing [\#3034](https://github.com/kokkos/kokkos/issues/3034)
- HIP: Add UniqueToken [\#3020](https://github.com/kokkos/kokkos/issues/3020)
- Autodetect number of devices [\#3013](https://github.com/kokkos/kokkos/issues/3013)
**Fixed bugs:**
- Check error code from `cudaStreamSynchronize` in CUDA fences [\#3255](https://github.com/kokkos/kokkos/issues/3255)
- Fix issue with C++ standard flags when using `nvcc_wrapper` with PGI [\#3254](https://github.com/kokkos/kokkos/issues/3254)
- Add missing threadfence in lock-based atomics [\#3208](https://github.com/kokkos/kokkos/issues/3208)
- Fix dedup of linker flags for shared lib on CMake <=3.12 [\#3176](https://github.com/kokkos/kokkos/issues/3176)
- Fix memory leak with CUDA streams [\#3170](https://github.com/kokkos/kokkos/issues/3170)
- BuildSystem: Fix OpenMP Target flags for Cray [\#3161](https://github.com/kokkos/kokkos/issues/3161)
- ScatterView: fix for OpenMPTarget remove inheritance from reducers [\#3162](https://github.com/kokkos/kokkos/issues/3162)
- BuildSystem: Set OpenMP flags according to host compiler [\#3127](https://github.com/kokkos/kokkos/issues/3127)
- OpenMP: Fix logic for nested omp in partition\_master bug [\#3101](https://github.com/kokkos/kokkos/issues/3101)
- nvcc\_wrapper: send --cudart to nvcc instead of host compiler [\#3092](https://github.com/kokkos/kokkos/issues/3092)
- BuildSystem: Fixes for Cuda/11 and c++17 [\#3085](https://github.com/kokkos/kokkos/issues/3085)
- HIP: Fix print\_configuration [\#3080](https://github.com/kokkos/kokkos/issues/3080)
- Conditionally define get\_gpu [\#3072](https://github.com/kokkos/kokkos/issues/3072)
- Fix bounds for ranges in random number generator [\#3069](https://github.com/kokkos/kokkos/issues/3069)
- Fix Cuda minor arch check [\#3035](https://github.com/kokkos/kokkos/issues/3035)
- BuildSystem: Add -expt-relaxed-constexpr flag to nvcc\_wrapper [\#3021](https://github.com/kokkos/kokkos/issues/3021)
**Incompatibilities:**
- Remove ETI support [\#3157](https://github.com/kokkos/kokkos/issues/3157)
- Remove KOKKOS\_INTERNAL\_ENABLE\_NON\_CUDA\_BACKEND [\#3147](https://github.com/kokkos/kokkos/issues/3147)
- Remove core/unit\_test/config [\#3146](https://github.com/kokkos/kokkos/issues/3146)
- Removed the preprocessor branch for KOKKOS\_ENABLE\_PROFILING [\#3115](https://github.com/kokkos/kokkos/issues/3115)
- Disable profiling with MSVC [\#3066](https://github.com/kokkos/kokkos/issues/3066)
**Closed issues:**
- Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097)
- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095)
- Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083)
- In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081)
- Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070)
- DefaultInit tests failing when using CTest resource allocation feature [\#3040](https://github.com/kokkos/kokkos/issues/3040)
- Add installation testing. [\#3037](https://github.com/kokkos/kokkos/issues/3037)
- nvcc\_wrapper needs to handle `-expt-relaxed-constexpr` flag [\#3017](https://github.com/kokkos/kokkos/issues/3017)
- CPU core oversubscription warning on macOS with OpenMP backend [\#2996](https://github.com/kokkos/kokkos/issues/2996)
- Default behavior of KOKKOS\_NUM\_DEVICES to use all devices available [\#2975](https://github.com/kokkos/kokkos/issues/2975)
- Assert blocksize \> 0 [\#2974](https://github.com/kokkos/kokkos/issues/2974)
- Add ability to assign kokkos profile function from executable [\#2973](https://github.com/kokkos/kokkos/issues/2973)
- ScatterView Support for the pre/post increment operator [\#2967](https://github.com/kokkos/kokkos/issues/2967)
- Compiler issue: Cuda build with clang 10 has errors with the atomic unit tests [\#3237](https://github.com/kokkos/kokkos/issues/3237)
- Incompatibility of flags for C++ standard with PGI v20.4 on Power9/NVIDIA V100 system [\#3252](https://github.com/kokkos/kokkos/issues/3252)
- Error configuring as subproject [\#3140](https://github.com/kokkos/kokkos/issues/3140)
- CMake fails with Nvidia compilers when the GPU architecture option is not supplied (Fix configure with OMPT and Cuda) [\#3207](https://github.com/kokkos/kokkos/issues/3207)
- PGI compiler being passed the gcc -fopenmp flag [\#3125](https://github.com/kokkos/kokkos/issues/3125)
- Cuda: Memory leak when using CUDA stream [\#3167](https://github.com/kokkos/kokkos/issues/3167)
- RangePolicy has an implicitly deleted assignment operator [\#3192](https://github.com/kokkos/kokkos/issues/3192)
- MemorySpace::allocate needs to have memory pool counting. [\#3064](https://github.com/kokkos/kokkos/issues/3064)
- Missing write fence for lock based atomics on CUDA [\#3038](https://github.com/kokkos/kokkos/issues/3038)
- CUDA compute capability version check problem [\#3026](https://github.com/kokkos/kokkos/issues/3026)
- Make DynRankView fencing consistent [\#3014](https://github.com/kokkos/kokkos/issues/3014)
- nvcc\_wrapper can't handle -Xcompiler -o out.o [\#2993](https://github.com/kokkos/kokkos/issues/2993)
- Reductions of non-trivial types of size 4 fail in CUDA shfl operations [\#2990](https://github.com/kokkos/kokkos/issues/2990)
- complex\_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989)
- Span of degenerated \(zero-length\) subviews is not zero in some special cases [\#2979](https://github.com/kokkos/kokkos/issues/2979)
- Rank 1 custom layouts don't work as expected. [\#2840](https://github.com/kokkos/kokkos/issues/2840)
## [3.1.01](https://github.com/kokkos/kokkos/tree/3.1.01) (2020-04-14)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.1.00...3.1.01)
**Fixed bugs:**
- Fix complex_double misalignment in reduce, clang+CUDA [\#2989](https://github.com/kokkos/kokkos/issues/2989)
- Fix compilation fails when profiling disabled and CUDA enabled [\#3001](https://github.com/kokkos/kokkos/issues/3001)
- Fix cuda reduction of non-trivial scalars of size 4 [\#2990](https://github.com/kokkos/kokkos/issues/2990)
- Configure and install version file when building in Trilinos [\#2957](https://github.com/kokkos/kokkos/pull/2957)
- Fix OpenMPTarget build missing include and namespace [\#3000](https://github.com/kokkos/kokkos/issues/3000)
- fix typo in KOKKOS_SET_EXE_PROPERTY() [\#2959](https://github.com/kokkos/kokkos/issues/2959)
- Fix non-zero span subviews of zero sized subviews [\#2979](https://github.com/kokkos/kokkos/issues/2979)
## [3.1.00](https://github.com/kokkos/kokkos/tree/3.1.00) (2020-04-14)
[Full Changelog](https://github.com/kokkos/kokkos/compare/3.0.00...3.1.00)
**Features:**
- HIP Support for AMD
- OpenMPTarget Support with clang
- Windows VS19 (Serial) Support [\#1533](https://github.com/kokkos/kokkos/issues/1533)
**Implemented enhancements:**
- generate\_makefile.bash should allow tests to be disabled [\#2886](https://github.com/kokkos/kokkos/issues/2886)
- clang/7+cuda/9 build -Werror-unused parameter error in nightly test [\#2884](https://github.com/kokkos/kokkos/issues/2884)
- ScatterView memory space is not user settable [\#2826](https://github.com/kokkos/kokkos/issues/2826)
- clang/8+cuda/10.0 build error with c++17 [\#2809](https://github.com/kokkos/kokkos/issues/2809)
- warnings.... [\#2805](https://github.com/kokkos/kokkos/issues/2805)
- Kokkos version in cpp define [\#2787](https://github.com/kokkos/kokkos/issues/2787)
- Remove Defunct QThreads Backend [\#2751](https://github.com/kokkos/kokkos/issues/2751)
- Improve Kokkos::fence behavior with multiple execution spaces [\#2659](https://github.com/kokkos/kokkos/issues/2659)
- polylithic\(?\) initialization of Kokkos [\#2658](https://github.com/kokkos/kokkos/issues/2658)
- Unnecessary\(?\) check for host execution space initialization from Cuda initialization [\#2652](https://github.com/kokkos/kokkos/issues/2652)
- Kokkos error reporting failures with CUDA GPUs in exclusive mode [\#2471](https://github.com/kokkos/kokkos/issues/2471)
- atomicMax equivalent \(and other atomics\) [\#2401](https://github.com/kokkos/kokkos/issues/2401)
- Fix alignment for Kokkos::complex [\#2255](https://github.com/kokkos/kokkos/issues/2255)
- Warnings with Cuda 10.1 [\#2206](https://github.com/kokkos/kokkos/issues/2206)
- dual view with Kokkos::ViewAllocateWithoutInitializing [\#2188](https://github.com/kokkos/kokkos/issues/2188)
- Check error code from cudaOccupancyMaxActiveBlocksPerMultiprocessor [\#2172](https://github.com/kokkos/kokkos/issues/2172)
- Add non-member Kokkos::resize/realloc for DualView [\#2170](https://github.com/kokkos/kokkos/issues/2170)
- Construct DualView without initialization [\#2046](https://github.com/kokkos/kokkos/issues/2046)
- Expose is\_assignable to determine if one view can be assigned to another [\#1936](https://github.com/kokkos/kokkos/issues/1936)
- profiling label [\#1935](https://github.com/kokkos/kokkos/issues/1935)
- team\_broadcast of bool failed on CUDA backend [\#1908](https://github.com/kokkos/kokkos/issues/1908)
- View static\_extent [\#660](https://github.com/kokkos/kokkos/issues/660)
- Misleading Kokkos::Cuda::initialize ERROR message when compiled for wrong GPU architecture [\#1944](https://github.com/kokkos/kokkos/issues/1944)
- Cryptic Error When Malloc Fails [\#2164](https://github.com/kokkos/kokkos/issues/2164)
- Drop support for intermediate standards in CMake [\#2336](https://github.com/kokkos/kokkos/issues/2336)
**Fixed bugs:**
- DualView sync\_device with length zero creates cuda errors [\#2946](https://github.com/kokkos/kokkos/issues/2946)
- building with nvcc and clang \(or clang based XL\) as host compiler: "Kokkos::atomic\_fetch\_min\(volatile int \*, int\)" has already been defined [\#2903](https://github.com/kokkos/kokkos/issues/2903)
- Cuda 9.1,10.1 debug builds failing due to -Werror=unused-parameter [\#2880](https://github.com/kokkos/kokkos/issues/2880)
- clang -Werror: Kokkos\_FixedBufferMemoryPool.hpp:140:28: error: unused parameter 'alloc\_size' [\#2869](https://github.com/kokkos/kokkos/issues/2869)
- intel/16.0.1, intel/17.0.1 nightly build failures with debugging enabled [\#2867](https://github.com/kokkos/kokkos/issues/2867)
- intel/16.0.1 debug build errors [\#2863](https://github.com/kokkos/kokkos/issues/2863)
- xl/16.1.1 with cpp14, openmp build, nightly test failures [\#2856](https://github.com/kokkos/kokkos/issues/2856)
- Intel nightly test failures: team\_vector [\#2852](https://github.com/kokkos/kokkos/issues/2852)
- Kokkos Views with intmax/2\<N\<intmax can hang during construction
- operator\>\> for complex\<T\> uses std::ostream, not std::istream [\#2313](https://github.com/kokkos/kokkos/issues/2313)
- Macros: Restrict not honored for non-intel compilers [\#1922](https://github.com/kokkos/kokkos/issues/1922)
## [2.9.00](https://github.com/kokkos/kokkos/tree/2.9.00) (2019-06-24)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.8.00...2.9.00)
**Implemented enhancements:**
- Capability: CUDA Streams [\#1723](https://github.com/kokkos/kokkos/issues/1723)
- Capability: CUDA Stream support for parallel\_reduce [\#2061](https://github.com/kokkos/kokkos/issues/2061)
- Capability: Feature Request: TeamVectorRange [\#713](https://github.com/kokkos/kokkos/issues/713)
- Capability: Adding HPX backend [\#2080](https://github.com/kokkos/kokkos/issues/2080)
- Capability: TaskScheduler to have multiple queues [\#565](https://github.com/kokkos/kokkos/issues/565)
- Capability: Support for additional reductions in ScatterView [\#1674](https://github.com/kokkos/kokkos/issues/1674)
- Capability: Request: deep\_copy within parallel regions [\#689](https://github.com/kokkos/kokkos/issues/689)
- Capability: Feature Request: `create\_mirror\_view\_without\_initializing` [\#1765](https://github.com/kokkos/kokkos/issues/1765)
- View: Use SFINAE to restrict possible View type conversions [\#2127](https://github.com/kokkos/kokkos/issues/2127)
- Deprecation: Deprecate ExecutionSpace::fence\(\) as static function and make it non-static [\#2140](https://github.com/kokkos/kokkos/issues/2140)
- Deprecation: Deprecate LayoutTileLeft [\#2122](https://github.com/kokkos/kokkos/issues/2122)
- Macros: KOKKOS\_RESTRICT defined for non-Intel compilers [\#2038](https://github.com/kokkos/kokkos/issues/2038)
**Fixed bugs:**
- Cuda: TeamThreadRange loop count on device is passed by reference to host static constexpr [\#1733](https://github.com/kokkos/kokkos/issues/1733)
- Cuda: Build error with relocatable device code with CUDA 10.1 GCC 7.3 [\#2134](https://github.com/kokkos/kokkos/issues/2134)
- Cuda: cudaFuncSetCacheConfig is setting CachePreferShared too often [\#2066](https://github.com/kokkos/kokkos/issues/2066)
- Cuda: TeamPolicy doesn't throw when created with non-viable vector length and also doesn't backscale to viable one [\#2020](https://github.com/kokkos/kokkos/issues/2020)
- Cuda: cudaMemcpy error for large league sizes on V100 [\#1991](https://github.com/kokkos/kokkos/issues/1991)
- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958)
- TeamThreadRange: Inconsistent results from TeamThreadRange reduction [\#1905](https://github.com/kokkos/kokkos/issues/1905)
- Atomics: atomic\_fetch\_oper & atomic\_oper\_fetch don't build for complex\<T\> [\#1964](https://github.com/kokkos/kokkos/issues/1964)
- Views: Kokkos randomread Views leak memory [\#2155](https://github.com/kokkos/kokkos/issues/2155)
- ScatterView: LayoutLeft overload currently non-functional [\#2165](https://github.com/kokkos/kokkos/issues/2165)
- KNL: With intel 17.2.174 illegal instruction in random number test [\#2078](https://github.com/kokkos/kokkos/issues/2078)
- Bitset: Enable copy constructor on device [\#2094](https://github.com/kokkos/kokkos/issues/2094)
- Examples: do not compile due to template deduction error \(multi\_fem\) [\#1928](https://github.com/kokkos/kokkos/issues/1928)
## [2.8.00](https://github.com/kokkos/kokkos/tree/2.8.00) (2019-02-05)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.24...2.8.00)
**Implemented enhancements:**
- Capability, Tests: C++14 support and testing [\#1914](https://github.com/kokkos/kokkos/issues/1914)
- Capability: Add environment variables for all command line arguments [\#1798](https://github.com/kokkos/kokkos/issues/1798)
- Capability: --kokkos-ndevices not working for Slurm [\#1920](https://github.com/kokkos/kokkos/issues/1920)
- View: Undefined behavior when deep copying from and to an empty unmanaged view [\#1967](https://github.com/kokkos/kokkos/issues/1967)
- BuildSystem: nvcc\_wrapper should stop immediately if nvcc is not in PATH [\#1861](https://github.com/kokkos/kokkos/issues/1861)
**Fixed bugs:**
- Cuda: Fix Volta Issues 1 Non-deterministic behavior on Volta, runs fine on Pascal [\#1949](https://github.com/kokkos/kokkos/issues/1949)
- Cuda: Fix Volta Issues 2 CUDA Team Scan gives wrong values on Volta with -G compile flag [\#1942](https://github.com/kokkos/kokkos/issues/1942)
- Cuda: illegal warp sync in parallel\_reduce by functor on Turing 75 [\#1958](https://github.com/kokkos/kokkos/issues/1958)
- Threads: Pthreads backend does not handle RangePolicy with offset correctly [\#1976](https://github.com/kokkos/kokkos/issues/1976)
- Atomics: atomic\_fetch\_oper has no case for Kokkos::complex\<double\> or other 16-byte types [\#1951](https://github.com/kokkos/kokkos/issues/1951)
- MDRangePolicy: Fix zero-length range [\#1948](https://github.com/kokkos/kokkos/issues/1948)
- TeamThreadRange: TeamThreadRange MaxLoc reduce doesn't compile [\#1909](https://github.com/kokkos/kokkos/issues/1909)
## [2.7.24](https://github.com/kokkos/kokkos/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.00...2.7.24)
**Implemented enhancements:**
- DualView: Add non-templated functions for sync, need\_sync, view, modify [\#1858](https://github.com/kokkos/kokkos/issues/1858)
- DualView: Avoid needlessly allocates and initializes modify\_host and modify\_device flag views [\#1831](https://github.com/kokkos/kokkos/issues/1831)
- DualView: Incorrect deduction of "not device type" [\#1659](https://github.com/kokkos/kokkos/issues/1659)
- BuildSystem: Add KOKKOS\_ENABLE\_CXX14 and KOKKOS\_ENABLE\_CXX17 [\#1602](https://github.com/kokkos/kokkos/issues/1602)
- BuildSystem: Installed kokkos\_generated\_settings.cmake contains build directories instead of install directories [\#1838](https://github.com/kokkos/kokkos/issues/1838)
- BuildSystem: KOKKOS\_ARCH: add ticks to printout of improper arch setting [\#1649](https://github.com/kokkos/kokkos/issues/1649)
- BuildSystem: Make core/src/Makefile for Cuda use needed nvcc\_wrapper [\#1296](https://github.com/kokkos/kokkos/issues/1296)
- Build: Support PGI as host compiler for NVCC [\#1828](https://github.com/kokkos/kokkos/issues/1828)
- Build: Many Warnings Fixed e.g. [\#1786](https://github.com/kokkos/kokkos/issues/1786)
- Capability: OffsetView with non-zero begin index [\#567](https://github.com/kokkos/kokkos/issues/567)
- Capability: Reductions into device side view [\#1788](https://github.com/kokkos/kokkos/issues/1788)
- Capability: Add max\_size to Kokkos::Array [\#1760](https://github.com/kokkos/kokkos/issues/1760)
- Capability: View Assignment: LayoutStride -\> LayoutLeft and LayoutStride -\> LayoutRight [\#1594](https://github.com/kokkos/kokkos/issues/1594)
- Capability: Atomic function allow implicit conversion of update argument [\#1571](https://github.com/kokkos/kokkos/issues/1571)
- Capability: Add team\_size\_max with tagged functors [\#663](https://github.com/kokkos/kokkos/issues/663)
- Capability: Fix alignment of views from Kokkos\_ScratchSpace should use different alignment [\#1700](https://github.com/kokkos/kokkos/issues/1700)
- Capability: create\_mirror\_view\_and\_copy for DynRankView [\#1651](https://github.com/kokkos/kokkos/issues/1651)
- Capability: DeepCopy HBWSpace / HostSpace [\#548](https://github.com/kokkos/kokkos/issues/548)
- ROCm: support team vector scan [\#1645](https://github.com/kokkos/kokkos/issues/1645)
- ROCm: Merge from rocm-hackathon2 [\#1636](https://github.com/kokkos/kokkos/issues/1636)
- ROCm: Add ParallelScanWithTotal [\#1611](https://github.com/kokkos/kokkos/issues/1611)
- ROCm: Implement MDRange in ROCm [\#1314](https://github.com/kokkos/kokkos/issues/1314)
- ROCm: Implement Reducers for Nested Parallelism Levels [\#963](https://github.com/kokkos/kokkos/issues/963)
- ROCm: Add asynchronous deep copy [\#959](https://github.com/kokkos/kokkos/issues/959)
- Tests: Memory pool test seems to allocate 8GB [\#1830](https://github.com/kokkos/kokkos/issues/1830)
- Tests: Add unit\_test for team\_broadcast [\#734](https://github.com/kokkos/kokkos/issues/734)
**Fixed bugs:**
- BuildSystem: Makefile.kokkos gets gcc-toolchain wrong if gcc is cached [\#1841](https://github.com/kokkos/kokkos/issues/1841)
- BuildSystem: kokkos\_generated\_settings.cmake placement is inconsistent [\#1771](https://github.com/kokkos/kokkos/issues/1771)
- BuildSystem: Invalid escape sequence \. in kokkos\_functions.cmake [\#1661](https://github.com/kokkos/kokkos/issues/1661)
- BuildSystem: Problem in Kokkos generated cmake file [\#1770](https://github.com/kokkos/kokkos/issues/1770)
- BuildSystem: invalid file names on windows [\#1671](https://github.com/kokkos/kokkos/issues/1671)
- Tests: reducers min/max\_loc test fails randomly due to multiple min values and thus multiple valid locations [\#1681](https://github.com/kokkos/kokkos/issues/1681)
- Tests: cuda.scatterview unit test causes "Bus error" when force\_uvm and enable\_lambda are enabled [\#1852](https://github.com/kokkos/kokkos/issues/1852)
- Tests: cuda.cxx11 unit test fails when force\_uvm and enable\_lambda are enabled [\#1850](https://github.com/kokkos/kokkos/issues/1850)
- Tests: threads.reduce\_device\_view\_range\_policy failing with Cuda/8.0.44 and RDC [\#1836](https://github.com/kokkos/kokkos/issues/1836)
- Build: compile error when compiling Kokkos with hwloc 2.0.1 \(on OSX 10.12.6, with g++ 7.2.0\) [\#1506](https://github.com/kokkos/kokkos/issues/1506)
- Build: dual\_view.view broken with UVM [\#1834](https://github.com/kokkos/kokkos/issues/1834)
- Build: White cuda/9.2 + gcc/7.2 warnings triggering errors [\#1833](https://github.com/kokkos/kokkos/issues/1833)
- Build: warning: enum constant in boolean context [\#1813](https://github.com/kokkos/kokkos/issues/1813)
- Capability: Fix overly conservative max\_team\_size thingy [\#1808](https://github.com/kokkos/kokkos/issues/1808)
- DynRankView: Ctors taking ViewAllocateWithoutInitializing broken [\#1783](https://github.com/kokkos/kokkos/issues/1783)
- Cuda: Apollo cuda.team\_broadcast test fail with clang-6.0 [\#1762](https://github.com/kokkos/kokkos/issues/1762)
- Cuda: Clang spurious test failure in impl\_view\_accessible [\#1753](https://github.com/kokkos/kokkos/issues/1753)
- Cuda: Kokkos::complex\<double\> atomic deadlocks with Clang 6 Cuda build with -O0 [\#1752](https://github.com/kokkos/kokkos/issues/1752)
- Cuda: LayoutStride Test fails for UVM as default memory space [\#1688](https://github.com/kokkos/kokkos/issues/1688)
- Cuda: Scan wrong values on Volta [\#1676](https://github.com/kokkos/kokkos/issues/1676)
- Cuda: Kokkos::deep\_copy error with CudaUVM and Kokkos::Serial spaces [\#1652](https://github.com/kokkos/kokkos/issues/1652)
- Cuda: cudaErrorInvalidConfiguration with debug build [\#1647](https://github.com/kokkos/kokkos/issues/1647)
- Cuda: parallel\_for with TeamPolicy::team\_size\_recommended with launch bounds not working -- reported by Daniel Holladay [\#1283](https://github.com/kokkos/kokkos/issues/1283)
- Cuda: Using KOKKOS\_CLASS\_LAMBDA in a class with Kokkos::Random\_XorShift64\_Pool member data [\#1696](https://github.com/kokkos/kokkos/issues/1696)
- Long Build Times on Darwin [\#1721](https://github.com/kokkos/kokkos/issues/1721)
- Capability: Typo in Kokkos\_Sort.hpp - BinOp3D - wrong comparison [\#1720](https://github.com/kokkos/kokkos/issues/1720)
- Buffer overflow in SharedAllocationRecord in Kokkos\_HostSpace.cpp [\#1673](https://github.com/kokkos/kokkos/issues/1673)
- Serial unit test failure [\#1632](https://github.com/kokkos/kokkos/issues/1632)
## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00)
**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.7**
**Implemented enhancements:**
- Deprecate team\_size auto adjusting to maximal value possible [\#1618](https://github.com/kokkos/kokkos/issues/1618)
- DynamicView - remove restrictions to std::is\_trivial types and value\_type is power of two [\#1586](https://github.com/kokkos/kokkos/issues/1586)
- Kokkos::StaticCrsGraph does not propagate memory traits \(e.g., Unmanaged\) [\#1581](https://github.com/kokkos/kokkos/issues/1581)
- Adding ETI for DeepCopy / ViewFill etc. [\#1578](https://github.com/kokkos/kokkos/issues/1578)
- Deprecate all the left over KOKKOS\_HAVE\_ Macros and Kokkos\_OldMacros.hpp [\#1572](https://github.com/kokkos/kokkos/issues/1572)
- Error if Kokkos\_ARCH set in CMake [\#1555](https://github.com/kokkos/kokkos/issues/1555)
- Deprecate ExecSpace::initialize / ExecSpace::finalize [\#1532](https://github.com/kokkos/kokkos/issues/1532)
- New API for TeamPolicy property setting [\#1531](https://github.com/kokkos/kokkos/issues/1531)
- clang 6.0 + cuda debug out-of-memory test failure [\#1521](https://github.com/kokkos/kokkos/issues/1521)
- Cuda UniqueToken interface not consistent with other backends [\#1505](https://github.com/kokkos/kokkos/issues/1505)
- Move Reducers out of Experimental namespace [\#1494](https://github.com/kokkos/kokkos/issues/1494)
- Provide scope guard for initialize/finalize [\#1479](https://github.com/kokkos/kokkos/issues/1479)
- Check Kokkos::is\_initialized in SharedAllocationRecord dtor [\#1465](https://github.com/kokkos/kokkos/issues/1465)
- Remove static list of allocations [\#1464](https://github.com/kokkos/kokkos/issues/1464)
- Makefiles: Support single compile/link line use case [\#1402](https://github.com/kokkos/kokkos/issues/1402)
- ThreadVectorRange with a range [\#1400](https://github.com/kokkos/kokkos/issues/1400)
- Exclusive scan + last value API [\#1358](https://github.com/kokkos/kokkos/issues/1358)
- Install kokkos\_generated\_settings.cmake [\#1348](https://github.com/kokkos/kokkos/issues/1348)
- Kokkos arrays \(not views!\) don't do bounds checking in debug mode [\#1342](https://github.com/kokkos/kokkos/issues/1342)
- Expose round-robin GPU assignment outside of initialize\(int, char\*\*\) [\#1318](https://github.com/kokkos/kokkos/issues/1318)
- DynamicView misses use\_count and label function [\#1298](https://github.com/kokkos/kokkos/issues/1298)
- View constructor should check arguments [\#1286](https://github.com/kokkos/kokkos/issues/1286)
- False Positive on Oversubscription Warning [\#1207](https://github.com/kokkos/kokkos/issues/1207)
- Allow \(require\) execution space for 1st arg of VerifyExecutionCanAccessMemorySpace [\#1192](https://github.com/kokkos/kokkos/issues/1192)
- ROCm: Add ROCmHostPinnedSpace [\#958](https://github.com/kokkos/kokkos/issues/958)
- power of two functions [\#656](https://github.com/kokkos/kokkos/issues/656)
- CUDA 8 has 64bit \_\_shfl [\#361](https://github.com/kokkos/kokkos/issues/361)
- Add TriBITS/CMake configure information about node types [\#243](https://github.com/kokkos/kokkos/issues/243)
**Fixed bugs:**
- CUDA atomic\_fetch\_sub for doubles is hitting CAS instead of intrinsic [\#1624](https://github.com/kokkos/kokkos/issues/1624)
- Bug: use of ballot on Volta [\#1612](https://github.com/kokkos/kokkos/issues/1612)
- Kokkos::deep\_copy memory access failures [\#1583](https://github.com/kokkos/kokkos/issues/1583)
- g++ -std option doubly set for cmake project [\#1548](https://github.com/kokkos/kokkos/issues/1548)
- ViewFill for 1D Views of larger 32bit entries fails [\#1541](https://github.com/kokkos/kokkos/issues/1541)
- CUDA Volta another warpsync bug [\#1520](https://github.com/kokkos/kokkos/issues/1520)
- triple\_nested\_parallelism fails with KOKKOS\_DEBUG and CUDA [\#1513](https://github.com/kokkos/kokkos/issues/1513)
- Jenkins errors in Kokkos\_SharedAlloc.cpp with debug build [\#1511](https://github.com/kokkos/kokkos/issues/1511)
- Kokkos::Sort out-of-bounds with empty bins [\#1504](https://github.com/kokkos/kokkos/issues/1504)
- Get rid of deprecated functions inside Kokkos [\#1484](https://github.com/kokkos/kokkos/issues/1484)
- get\_work\_partition casts int64\_t to int, causing a seg fault [\#1481](https://github.com/kokkos/kokkos/issues/1481)
- NVCC bug with \_\_device\_\_ on defaulted function [\#1470](https://github.com/kokkos/kokkos/issues/1470)
- CMake example broken with CUDA backend [\#1468](https://github.com/kokkos/kokkos/issues/1468)
## [2.6.00](https://github.com/kokkos/kokkos/tree/2.6.00) (2018-03-07)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.5.00...2.6.00)
**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.6**
**Implemented enhancements:**
- Support NVIDIA Volta microarchitecture [\#1466](https://github.com/kokkos/kokkos/issues/1466)
- Kokkos - Define empty functions when profiling disabled [\#1424](https://github.com/kokkos/kokkos/issues/1424)
- Don't use \_\_constant\_\_ cache for lock arrays, enable once per run update instead of once per call [\#1385](https://github.com/kokkos/kokkos/issues/1385)
- task dag enhancement. [\#1354](https://github.com/kokkos/kokkos/issues/1354)
- Cuda task team collectives and stack size [\#1353](https://github.com/kokkos/kokkos/issues/1353)
- Replace View operator acceptance of more than rank integers with 'access' function [\#1333](https://github.com/kokkos/kokkos/issues/1333)
- Interoperability: Do not shut down backend execution space runtimes upon calling finalize. [\#1305](https://github.com/kokkos/kokkos/issues/1305)
- shmem\_size for LayoutStride [\#1291](https://github.com/kokkos/kokkos/issues/1291)
- Kokkos::resize performs poorly on 1D Views [\#1270](https://github.com/kokkos/kokkos/issues/1270)
- stride\(\) is inconsistent with dimension\(\), extent\(\), etc. [\#1214](https://github.com/kokkos/kokkos/issues/1214)
- Kokkos::sort defaults to std::sort on host [\#1208](https://github.com/kokkos/kokkos/issues/1208)
- DynamicView with host size grow [\#1206](https://github.com/kokkos/kokkos/issues/1206)
- Unmanaged View with Anonymous Memory Space [\#1175](https://github.com/kokkos/kokkos/issues/1175)
- Sort subset of Kokkos::DynamicView [\#1160](https://github.com/kokkos/kokkos/issues/1160)
- MDRange policy doesn't support lambda reductions [\#1054](https://github.com/kokkos/kokkos/issues/1054)
- Add ability to set hook on Kokkos::finalize [\#714](https://github.com/kokkos/kokkos/issues/714)
- Atomics with Serial Backend - Default should be Disable? [\#549](https://github.com/kokkos/kokkos/issues/549)
- KOKKOS\_ENABLE\_DEPRECATED\_CODE [\#1359](https://github.com/kokkos/kokkos/issues/1359)
**Fixed bugs:**
- cuda\_internal\_maximum\_warp\_count returns 8, but I believe it should return 16 for P100 [\#1269](https://github.com/kokkos/kokkos/issues/1269)
- Cuda: level 1 scratch memory bug \(reported by Stan Moore\) [\#1434](https://github.com/kokkos/kokkos/issues/1434)
- MDRangePolicy Reduction requires value\_type typedef in Functor [\#1379](https://github.com/kokkos/kokkos/issues/1379)
- Kokkos DeepCopy between empty views fails [\#1369](https://github.com/kokkos/kokkos/issues/1369)
- Several issues with new CMake build infrastructure \(reported by Eric Phipps\) [\#1365](https://github.com/kokkos/kokkos/issues/1365)
- deep\_copy between rank-1 host/device views of differing layouts without UVM no longer works \(reported by Eric Phipps\) [\#1363](https://github.com/kokkos/kokkos/issues/1363)
- Profiling can't be disabled in CMake, and a parallel\_for is missing for tasks \(reported by Kyungjoo Kim\) [\#1349](https://github.com/kokkos/kokkos/issues/1349)
- get\_work\_partition int overflow \(reported by berryj5\) [\#1327](https://github.com/kokkos/kokkos/issues/1327)
- Kokkos::deep\_copy must fence even if the two views are the same [\#1303](https://github.com/kokkos/kokkos/issues/1303)
- CudaUVMSpace::allocate/deallocate must fence [\#1302](https://github.com/kokkos/kokkos/issues/1302)
- ViewResize on CUDA fails in Debug because of too many resources requested [\#1299](https://github.com/kokkos/kokkos/issues/1299)
- Cuda 9 and intrepid2 calls from Panzer. [\#1183](https://github.com/kokkos/kokkos/issues/1183)
- Slowdown due to tracking\_enabled\(\) in 2.04.00 \(found by Albany app\) [\#1016](https://github.com/kokkos/kokkos/issues/1016)
- Bounds checking fails with zero-span Views \(reported by Stan Moore\) [\#1411](https://github.com/kokkos/kokkos/issues/1411)
## [2.5.00](https://github.com/kokkos/kokkos/tree/2.5.00) (2017-12-15)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.11...2.5.00)
**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5**
**Implemented enhancements:**
- Provide Makefile.kokkos logic for CMake and TriBITS [\#878](https://github.com/kokkos/kokkos/issues/878)
- Add Scatter View [\#825](https://github.com/kokkos/kokkos/issues/825)
- Drop gcc 4.7 and intel 14 from supported compiler list [\#603](https://github.com/kokkos/kokkos/issues/603)
- Enable construction of unmanaged view using common\_view\_alloc\_prop [\#1170](https://github.com/kokkos/kokkos/issues/1170)
- Unused Function Warning with XL [\#1267](https://github.com/kokkos/kokkos/issues/1267)
- Add memory pool parameter check [\#1218](https://github.com/kokkos/kokkos/issues/1218)
- CUDA9: Fix warning for unsupported long double [\#1189](https://github.com/kokkos/kokkos/issues/1189)
- CUDA9: fix warning on defaulted function marking [\#1188](https://github.com/kokkos/kokkos/issues/1188)
- CUDA9: fix warnings for deprecated warp level functions [\#1187](https://github.com/kokkos/kokkos/issues/1187)
- Add CUDA 9.0 nightly testing [\#1174](https://github.com/kokkos/kokkos/issues/1174)
- {OMPI,MPICH}\_CXX hack breaks nvcc\_wrapper use case [\#1166](https://github.com/kokkos/kokkos/issues/1166)
- KOKKOS\_HAVE\_CUDA\_LAMBDA became KOKKOS\_CUDA\_USE\_LAMBDA [\#1274](https://github.com/kokkos/kokkos/issues/1274)
**Fixed bugs:**
- MinMax Reducer with tagged operator doesn't compile [\#1251](https://github.com/kokkos/kokkos/issues/1251)
- Reducers for Tagged operators give wrong answer [\#1250](https://github.com/kokkos/kokkos/issues/1250)
- Kokkos not Compatible with Big Endian Machines? [\#1235](https://github.com/kokkos/kokkos/issues/1235)
- Parallel Scan hangs forever on BG/Q [\#1234](https://github.com/kokkos/kokkos/issues/1234)
- Threads backend doesn't compile with Clang on OS X [\#1232](https://github.com/kokkos/kokkos/issues/1232)
- $\(shell date\) needs quote [\#1264](https://github.com/kokkos/kokkos/issues/1264)
- Unqualified parallel\_for call conflicts with user-defined parallel\_for [\#1219](https://github.com/kokkos/kokkos/issues/1219)
- KokkosAlgorithms: CMake issue in unit tests [\#1212](https://github.com/kokkos/kokkos/issues/1212)
- Intel 18 Error: "simd pragma has been deprecated" [\#1210](https://github.com/kokkos/kokkos/issues/1210)
- Memory leak in Kokkos::initialize [\#1194](https://github.com/kokkos/kokkos/issues/1194)
- CUDA9: compiler error with static assert template arguments [\#1190](https://github.com/kokkos/kokkos/issues/1190)
- Kokkos::Serial::is\_initialized returns always true [\#1184](https://github.com/kokkos/kokkos/issues/1184)
- Triple nested parallelism still fails on bowman [\#1093](https://github.com/kokkos/kokkos/issues/1093)
- OpenMP openmp.range on Develop Runs Forever on POWER7+ with RHEL7 and GCC4.8.5 [\#995](https://github.com/kokkos/kokkos/issues/995)
- Rendezvous performance at global scope [\#985](https://github.com/kokkos/kokkos/issues/985)
## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11)
**Implemented enhancements:**
- Add Subview pattern. [\#648](https://github.com/kokkos/kokkos/issues/648)
- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060)
- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161)
- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092)
- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142)
- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320)
- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126)
- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108)
- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101)
- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064)
- Fix ROCm: Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038)
- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004)
- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809)
- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
**Fixed bugs:**
- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134)
- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177)
- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154)
- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140)
- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139)
- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128)
- MDRangePolicy neg idx test failure in debug mode [\#1113](https://github.com/kokkos/kokkos/issues/1113)
- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615)
## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
**Implemented enhancements:**
- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
**Fixed bugs:**
- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
- (Experimental) HBWSpace Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
- (Experimental) ROCm: algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
**Implemented enhancements:**
- Added ROCm backend to support AMD GPUs
- Kokkos::complex\<T\> behaves slightly differently from std::complex\<T\> [\#1011](https://github.com/kokkos/kokkos/issues/1011)
- Kokkos::Experimental::Crs constructor arguments were in the wrong order [\#992](https://github.com/kokkos/kokkos/issues/992)
- Work graph construction ease-of-use (one lambda for count and fill) [\#991](https://github.com/kokkos/kokkos/issues/991)
- when\_all returns pointer of futures (improved interface) [\#990](https://github.com/kokkos/kokkos/issues/990)
- Allow assignment of LayoutLeft to LayoutRight or vice versa for rank-0 Views [\#594](https://github.com/kokkos/kokkos/issues/594)
- Changed the meaning of Kokkos\_ENABLE\_CXX11\_DISPATCH\_LAMBDA [\#1035](https://github.com/kokkos/kokkos/issues/1035)
**Fixed bugs:**
- memory pool default constructor does not properly set member variables. [\#1007](https://github.com/kokkos/kokkos/issues/1007)
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
**Implemented enhancements:**
- Disallow enabling both OpenMP and Threads in the same executable [\#406](https://github.com/kokkos/kokkos/issues/406)
- Make Kokkos::OpenMP respect OMP environment even if hwloc is available [\#630](https://github.com/kokkos/kokkos/issues/630)
- Improve Atomics Performance on KNL/Broadwell where PREFETCHW/RFO is Available [\#898](https://github.com/kokkos/kokkos/issues/898)
- Kokkos::resize should test whether dimensions have changed before resizing [\#904](https://github.com/kokkos/kokkos/issues/904)
- Develop performance-regression/acceptance tests [\#737](https://github.com/kokkos/kokkos/issues/737)
- Make the deep\_copy Profiling hook a start/end system [\#890](https://github.com/kokkos/kokkos/issues/890)
- Add deep\_copy Profiling hook [\#843](https://github.com/kokkos/kokkos/issues/843)
- Append tag name to parallel construct name for Profiling [\#842](https://github.com/kokkos/kokkos/issues/842)
- Add view label to `View bounds error` message for CUDA backend [\#870](https://github.com/kokkos/kokkos/issues/870)
- Disable printing the loaded profiling library [\#824](https://github.com/kokkos/kokkos/issues/824)
- "Declared but never referenced" warnings [\#853](https://github.com/kokkos/kokkos/issues/853)
- Warnings about lock\_address\_cuda\_space [\#852](https://github.com/kokkos/kokkos/issues/852)
- WorkGraph execution policy [\#771](https://github.com/kokkos/kokkos/issues/771)
- Simplify makefiles by guarding compilation with appropriate KOKKOS\_ENABLE\_\#\#\# macros [\#716](https://github.com/kokkos/kokkos/issues/716)
- Cmake build: wrong include install directory [\#668](https://github.com/kokkos/kokkos/issues/668)
- Derived View type and allocation [\#566](https://github.com/kokkos/kokkos/issues/566)
- Fix Compiler warnings when compiling core unit tests for Cuda [\#214](https://github.com/kokkos/kokkos/issues/214)
**Fixed bugs:**
- Out-of-bounds read in Kokkos\_Layout.hpp [\#975](https://github.com/kokkos/kokkos/issues/975)
- CudaClang: Fix failing test with Clang 4.0 [\#941](https://github.com/kokkos/kokkos/issues/941)
- Respawn when memory pool allocation fails \(not available memory\) [\#940](https://github.com/kokkos/kokkos/issues/940)
- Memory pool aborts on zero allocation request, returns NULL for \< minimum [\#939](https://github.com/kokkos/kokkos/issues/939)
- Error with TaskScheduler query of underlying memory pool [\#917](https://github.com/kokkos/kokkos/issues/917)
- Profiling::\*Callee static variables declared in header [\#863](https://github.com/kokkos/kokkos/issues/863)
- calling \*Space::name\(\) causes compile error [\#862](https://github.com/kokkos/kokkos/issues/862)
- bug in Profiling::deallocateData [\#860](https://github.com/kokkos/kokkos/issues/860)
- task\_depend test failing, CUDA 8.0 + Pascal + RDC [\#829](https://github.com/kokkos/kokkos/issues/829)
- \[develop branch\] Standalone cmake issues [\#826](https://github.com/kokkos/kokkos/issues/826)
- Kokkos CUDA fails to compile with OMPI\_CXX and MPICH\_CXX wrappers [\#776](https://github.com/kokkos/kokkos/issues/776)
- Task Team reduction on Pascal [\#767](https://github.com/kokkos/kokkos/issues/767)
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
- TeamVector test on Cuda [\#670](https://github.com/kokkos/kokkos/issues/670)
- Clang 4.0 Cuda Build broken again [\#560](https://github.com/kokkos/kokkos/issues/560)
## [2.03.05](https://github.com/kokkos/kokkos/tree/2.03.05) (2017-05-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.00...2.03.05)
**Implemented enhancements:**
- Harmonize Custom Reductions over nesting levels [\#802](https://github.com/kokkos/kokkos/issues/802)
- Prevent users directly including KokkosCore\_config.h [\#815](https://github.com/kokkos/kokkos/issues/815)
- DualView aborts on concurrent host/device modify \(in debug mode\) [\#814](https://github.com/kokkos/kokkos/issues/814)
- Abort when running on a NVIDIA CC5.0 or higher architecture with code compiled for CC \< 5.0 [\#813](https://github.com/kokkos/kokkos/issues/813)
- Add "name" function to ExecSpaces [\#806](https://github.com/kokkos/kokkos/issues/806)
- Allow null Future in task spawn dependences [\#795](https://github.com/kokkos/kokkos/issues/795)
- Add Unit Tests for Kokkos::complex [\#785](https://github.com/kokkos/kokkos/issues/785)
- Add pow function for Kokkos::complex [\#784](https://github.com/kokkos/kokkos/issues/784)
- Square root of a complex [\#729](https://github.com/kokkos/kokkos/issues/729)
- Command line processing of --threads argument prevents users from having any commandline arguments starting with --threads [\#760](https://github.com/kokkos/kokkos/issues/760)
- Protected deprecated API with appropriate macro [\#756](https://github.com/kokkos/kokkos/issues/756)
- Allow task scheduler memory pool to be used by tasks [\#747](https://github.com/kokkos/kokkos/issues/747)
- View bounds checking on host-side performance: constructing a std::string [\#723](https://github.com/kokkos/kokkos/issues/723)
- Add check for AppleClang as compiler distinct from check for Clang. [\#705](https://github.com/kokkos/kokkos/issues/705)
- Uninclude source files for specific configurations to prevent link warning. [\#701](https://github.com/kokkos/kokkos/issues/701)
- Add --small option to snapshot script [\#697](https://github.com/kokkos/kokkos/issues/697)
- CMake Standalone Support [\#674](https://github.com/kokkos/kokkos/issues/674)
- CMake build unit test and install [\#808](https://github.com/kokkos/kokkos/issues/808)
- CMake: Fix having kokkos as a subdirectory in a pure cmake project [\#629](https://github.com/kokkos/kokkos/issues/629)
- Tribits macro assumes build directory is in top level source directory [\#654](https://github.com/kokkos/kokkos/issues/654)
- Use bin/nvcc\_wrapper, not config/nvcc\_wrapper [\#562](https://github.com/kokkos/kokkos/issues/562)
- Allow MemoryPool::allocate\(\) to be called from multiple threads per warp. [\#487](https://github.com/kokkos/kokkos/issues/487)
- Allow MemoryPool::allocate\(\) to be called from multiple threads per warp. [\#487](https://github.com/kokkos/kokkos/issues/487)
- Move OpenMP 4.5 OpenMPTarget backend into Develop [\#456](https://github.com/kokkos/kokkos/issues/456)
- Testing on ARM testbed [\#288](https://github.com/kokkos/kokkos/issues/288)
**Fixed bugs:**
- Fix label in OpenMP parallel\_reduce verify\_initialized [\#834](https://github.com/kokkos/kokkos/issues/834)
- TeamScratch Level 1 on Cuda hangs [\#820](https://github.com/kokkos/kokkos/issues/820)
- \[bug\] memory pool. [\#786](https://github.com/kokkos/kokkos/issues/786)
- Some Reduction Tests fail on Intel 18 with aggressive vectorization on [\#774](https://github.com/kokkos/kokkos/issues/774)
- Error copying dynamic view on copy of memory pool [\#773](https://github.com/kokkos/kokkos/issues/773)
- CUDA stack overflow with TaskDAG test [\#758](https://github.com/kokkos/kokkos/issues/758)
- ThreadVectorRange Customized Reduction Bug [\#739](https://github.com/kokkos/kokkos/issues/739)
- set\_scratch\_size overflows [\#726](https://github.com/kokkos/kokkos/issues/726)
- Get wrong results for compiler checks in Makefile on OS X. [\#706](https://github.com/kokkos/kokkos/issues/706)
- Fix check if multiple host architectures enabled. [\#702](https://github.com/kokkos/kokkos/issues/702)
- Threads Backend Does not Pass on Cray Compilers [\#609](https://github.com/kokkos/kokkos/issues/609)
- Rare bug in memory pool where allocation can finish on superblock in empty state [\#452](https://github.com/kokkos/kokkos/issues/452)
- LDFLAGS in core/unit\_test/Makefile: potential "undefined reference" to pthread lib [\#148](https://github.com/kokkos/kokkos/issues/148)
## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
**Implemented enhancements:**
- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711)
- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
- Turning off IVDEP for Intel 14. [\#638](https://github.com/kokkos/kokkos/issues/638)
- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
**Fixed bugs:**
- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)
**Implemented enhancements:**
- Containers: Adding block partitioning to StaticCrsGraph [\#625](https://github.com/kokkos/kokkos/issues/625)
- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610)
- OpenMP: error out if KOKKOS\_HAVE\_OPENMP is defined but not \_OPENMP [\#605](https://github.com/kokkos/kokkos/issues/605)
- CMake: fix standalone build with tests [\#604](https://github.com/kokkos/kokkos/issues/604)
- Change README \(that GitHub shows when opening Kokkos project page\) to tell users how to submit PRs [\#597](https://github.com/kokkos/kokkos/issues/597)
- Add correctness testing for all operators of Atomic View [\#420](https://github.com/kokkos/kokkos/issues/420)
- Allow assignment of Views with compatible memory spaces [\#290](https://github.com/kokkos/kokkos/issues/290)
- Build only one version of Kokkos library for tests [\#213](https://github.com/kokkos/kokkos/issues/213)
- Clean out old KOKKOS\_HAVE\_CXX11 macros clauses [\#156](https://github.com/kokkos/kokkos/issues/156)
- Harmonize Macro names [\#150](https://github.com/kokkos/kokkos/issues/150)
**Fixed bugs:**
- Cray and PGI: Kokkos\_Parallel\_Reduce [\#634](https://github.com/kokkos/kokkos/issues/634)
- Kokkos Make System can induce Errors on Cray Volta System [\#610](https://github.com/kokkos/kokkos/issues/610)
- Normal\(\) function random number generator doesn't give the expected distribution [\#592](https://github.com/kokkos/kokkos/issues/592)
## [2.02.07](https://github.com/kokkos/kokkos/tree/2.02.07) (2016-12-16)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.01...2.02.07)
**Implemented enhancements:**
- Add CMake option to enable Cuda Lambda support [\#589](https://github.com/kokkos/kokkos/issues/589)
- Add CMake option to enable Cuda RDC support [\#588](https://github.com/kokkos/kokkos/issues/588)
- Add Initial Intel Sky Lake Xeon-HPC Compiler Support to Kokkos Make System [\#584](https://github.com/kokkos/kokkos/issues/584)
- Building Tutorial Examples [\#582](https://github.com/kokkos/kokkos/issues/582)
- Internal way for using ThreadVectorRange without TeamHandle [\#574](https://github.com/kokkos/kokkos/issues/574)
- Testing: Add testing for uvm and rdc [\#571](https://github.com/kokkos/kokkos/issues/571)
- Profiling: Add Memory Tracing and Region Markers [\#557](https://github.com/kokkos/kokkos/issues/557)
- nvcc\_wrapper not installed with Kokkos built with CUDA through CMake [\#543](https://github.com/kokkos/kokkos/issues/543)
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
- Benchmarks: Add Gather benchmark [\#536](https://github.com/kokkos/kokkos/issues/536)
- Testing: add spot\_check option to test\_all\_sandia [\#535](https://github.com/kokkos/kokkos/issues/535)
- Deprecate Kokkos::Impl::VerifyExecutionCanAccessMemorySpace [\#527](https://github.com/kokkos/kokkos/issues/527)
- Add AtomicAdd support for 64bit float for Pascal [\#522](https://github.com/kokkos/kokkos/issues/522)
- Add Restrict and Aligned memory trait [\#517](https://github.com/kokkos/kokkos/issues/517)
- Kokkos Tests are Not Run using Compiler Optimization [\#501](https://github.com/kokkos/kokkos/issues/501)
- Add support for clang 3.7 w/ openmp backend [\#393](https://github.com/kokkos/kokkos/issues/393)
- Provide an error throw class [\#79](https://github.com/kokkos/kokkos/issues/79)
**Fixed bugs:**
- Cuda UVM Allocation test broken with UVM as default space [\#586](https://github.com/kokkos/kokkos/issues/586)
- Bug \(develop branch only\): multiple tests are now failing when forcing uvm usage. [\#570](https://github.com/kokkos/kokkos/issues/570)
- Error in generate\_makefile.sh for Kokkos when Compiler is Empty String/Fails [\#568](https://github.com/kokkos/kokkos/issues/568)
- XL 13.1.4 incorrect C++11 flag [\#553](https://github.com/kokkos/kokkos/issues/553)
- Improve DynRankView debug check [\#541](https://github.com/kokkos/kokkos/issues/541)
- Installing Library on MAC broken due to cp -u [\#539](https://github.com/kokkos/kokkos/issues/539)
- Intel Nightly Testing with Debug enabled fails [\#534](https://github.com/kokkos/kokkos/issues/534)
## [2.02.01](https://github.com/kokkos/kokkos/tree/2.02.01) (2016-11-01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.00...2.02.01)
**Implemented enhancements:**
- Add Changelog generation to our process. [\#506](https://github.com/kokkos/kokkos/issues/506)
**Fixed bugs:**
- Test scratch\_request fails in Serial with Debug enabled [\#520](https://github.com/kokkos/kokkos/issues/520)
- Bug In BoundsCheck for DynRankView [\#516](https://github.com/kokkos/kokkos/issues/516)
## [2.02.00](https://github.com/kokkos/kokkos/tree/2.02.00) (2016-10-30)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.10...2.02.00)
**Implemented enhancements:**
- Add PowerPC assembly for grabbing clock register in memory pool [\#511](https://github.com/kokkos/kokkos/issues/511)
- Add GCC 6.x support [\#508](https://github.com/kokkos/kokkos/issues/508)
- Test install and build against installed library [\#498](https://github.com/kokkos/kokkos/issues/498)
- Makefile.kokkos adds expt-extended-lambda to cuda build with clang [\#490](https://github.com/kokkos/kokkos/issues/490)
- Add top-level makefile option to just test kokkos-core unit-test [\#485](https://github.com/kokkos/kokkos/issues/485)
- Split and harmonize Object Files of Core UnitTests to increase build parallelism [\#484](https://github.com/kokkos/kokkos/issues/484)
- LayoutLeft to LayoutLeft subview for 3D and 4D views [\#473](https://github.com/kokkos/kokkos/issues/473)
- Add official Cuda 8.0 support [\#468](https://github.com/kokkos/kokkos/issues/468)
- Allow C++1Z Flag for Class Lambda capture [\#465](https://github.com/kokkos/kokkos/issues/465)
- Add Clang 4.0+ compilation of Cuda code [\#455](https://github.com/kokkos/kokkos/issues/455)
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
- Add name of view to "View bounds error" [\#432](https://github.com/kokkos/kokkos/issues/432)
- Move Sort Binning Operators into Kokkos namespace [\#421](https://github.com/kokkos/kokkos/issues/421)
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
- Import WithoutInitializing and AllowPadding into Kokkos namespace [\#325](https://github.com/kokkos/kokkos/issues/325)
- TeamThreadRange requires begin, end to be the same type [\#305](https://github.com/kokkos/kokkos/issues/305)
- CudaUVMSpace should track \# allocations, due to CUDA limit on \# UVM allocations [\#300](https://github.com/kokkos/kokkos/issues/300)
- Remove old View and its infrastructure [\#259](https://github.com/kokkos/kokkos/issues/259)
**Fixed bugs:**
- Bug in TestCuda\_Other.cpp: most likely assembly inserted into Device code [\#515](https://github.com/kokkos/kokkos/issues/515)
- Cuda Compute Capability check of GPU is outdated [\#509](https://github.com/kokkos/kokkos/issues/509)
- multi\_scratch test with hwloc and pthreads seg-faults. [\#504](https://github.com/kokkos/kokkos/issues/504)
- generate\_makefile.bash: "make install" is broken [\#503](https://github.com/kokkos/kokkos/issues/503)
- make clean in Out of Source Build/Tests Does Not Work Correctly [\#502](https://github.com/kokkos/kokkos/issues/502)
- Makefiles for test and examples have issues in Cuda when CXX is not explicitly specified [\#497](https://github.com/kokkos/kokkos/issues/497)
- Dispatch lambda test directly inside GTEST macro doesn't work with nvcc [\#491](https://github.com/kokkos/kokkos/issues/491)
- UnitTests with HWLOC enabled fail if run with mpirun bound to a single core [\#489](https://github.com/kokkos/kokkos/issues/489)
- Failing Reducer Test on Mac with Pthreads [\#479](https://github.com/kokkos/kokkos/issues/479)
- make test Dumps Error with Clang Not Found [\#471](https://github.com/kokkos/kokkos/issues/471)
- OpenMP TeamPolicy member broadcast not using correct volatile shared variable [\#424](https://github.com/kokkos/kokkos/issues/424)
- TaskPolicy - generate error when attempt to use uninitialized [\#396](https://github.com/kokkos/kokkos/issues/396)
- New task policy implementation is pulling in old experimental code. [\#372](https://github.com/kokkos/kokkos/issues/372)
- MemoryPool unit test hangs on Power8 with GCC 6.1.0 [\#298](https://github.com/kokkos/kokkos/issues/298)
## [2.01.10](https://github.com/kokkos/kokkos/tree/2.01.10) (2016-09-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.06...2.01.10)
**Implemented enhancements:**
- Enable Profiling by default in Tribits build [\#438](https://github.com/kokkos/kokkos/issues/438)
- parallel\_reduce\(0\), parallel\_scan\(0\) unit tests [\#436](https://github.com/kokkos/kokkos/issues/436)
- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
- Fix tutorials to track new Kokkos::View [\#323](https://github.com/kokkos/kokkos/issues/323)
- Rename team policy set\_scratch\_size. [\#195](https://github.com/kokkos/kokkos/issues/195)
**Fixed bugs:**
- Possible Issue with Intel 17.0.098 and GCC 6.1.0 in Develop Branch [\#445](https://github.com/kokkos/kokkos/issues/445)
- Makefile spits syntax error [\#435](https://github.com/kokkos/kokkos/issues/435)
- Kokkos::sort fails for view with all the same values [\#422](https://github.com/kokkos/kokkos/issues/422)
- Generic Reducers: can't accept inline constructed reducer [\#404](https://github.com/kokkos/kokkos/issues/404)
- data\(\)==NULL after realloc with LayoutStride [\#351](https://github.com/kokkos/kokkos/issues/351)
- const subview of const view with compile time dimensions on Cuda backend [\#310](https://github.com/kokkos/kokkos/issues/310)
- Kokkos \(in Trilinos\) Causes Internal Compiler Error on CUDA 8.0.21-EA on POWER8 [\#307](https://github.com/kokkos/kokkos/issues/307)
- Core Oversubscription Detection Broken? [\#159](https://github.com/kokkos/kokkos/issues/159)
## [2.01.06](https://github.com/kokkos/kokkos/tree/2.01.06) (2016-09-02)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.01.00...2.01.06)
**Implemented enhancements:**
- Add "standard" reducers for lambda-supportable customized reduce [\#411](https://github.com/kokkos/kokkos/issues/411)
- TaskPolicy - single thread back-end execution [\#390](https://github.com/kokkos/kokkos/issues/390)
- Kokkos master clone tag [\#387](https://github.com/kokkos/kokkos/issues/387)
- Query memory requirements from task policy [\#378](https://github.com/kokkos/kokkos/issues/378)
- Output order of test\_atomic.cpp is confusing [\#373](https://github.com/kokkos/kokkos/issues/373)
- Missing testing for atomics [\#341](https://github.com/kokkos/kokkos/issues/341)
- Feature request for Kokkos to provide Kokkos::atomic\_fetch\_max and atomic\_fetch\_min [\#336](https://github.com/kokkos/kokkos/issues/336)
- TaskPolicy\<Cuda\> performance requires teams mapped to warps [\#218](https://github.com/kokkos/kokkos/issues/218)
**Fixed bugs:**
- Reduce with Teams broken for custom initialize [\#407](https://github.com/kokkos/kokkos/issues/407)
- Failing Kokkos build on Debian [\#402](https://github.com/kokkos/kokkos/issues/402)
- Failing Tests on NVIDIA Pascal GPUs [\#398](https://github.com/kokkos/kokkos/issues/398)
- Algorithms: fill\_random assumes dimensions fit in unsigned int [\#389](https://github.com/kokkos/kokkos/issues/389)
- Kokkos::subview with RandomAccess Memory Trait [\#385](https://github.com/kokkos/kokkos/issues/385)
- Build warning \(signed / unsigned comparison\) in Cuda implementation [\#365](https://github.com/kokkos/kokkos/issues/365)
- wrong results for a parallel\_reduce with CUDA8 / Maxwell50 [\#352](https://github.com/kokkos/kokkos/issues/352)
- Hierarchical parallelism - 3 level unit test [\#344](https://github.com/kokkos/kokkos/issues/344)
- Can I allocate a View w/ both WithoutInitializing & AllowPadding? [\#324](https://github.com/kokkos/kokkos/issues/324)
- subview View layout determination [\#309](https://github.com/kokkos/kokkos/issues/309)
- Unit tests with Cuda - Maxwell [\#196](https://github.com/kokkos/kokkos/issues/196)
## [2.01.00](https://github.com/kokkos/kokkos/tree/2.01.00) (2016-07-21)
[Full Changelog](https://github.com/kokkos/kokkos/compare/End_C++98...2.01.00)
**Implemented enhancements:**
- Edit ViewMapping so assigning Views with the same custom layout compiles when const casting [\#327](https://github.com/kokkos/kokkos/issues/327)
- DynRankView: Performance improvement for operator\(\) [\#321](https://github.com/kokkos/kokkos/issues/321)
- Interoperability between static and dynamic rank views [\#295](https://github.com/kokkos/kokkos/issues/295)
- subview member function ? [\#280](https://github.com/kokkos/kokkos/issues/280)
- Interoperability between View and DynRankView. [\#245](https://github.com/kokkos/kokkos/issues/245)
- \(Trilinos\) build warning in atomic\_assign, with Kokkos::complex [\#177](https://github.com/kokkos/kokkos/issues/177)
- View\<\>::shmem\_size should runtime check for number of arguments equal to rank [\#176](https://github.com/kokkos/kokkos/issues/176)
- Custom reduction join via lambda argument [\#99](https://github.com/kokkos/kokkos/issues/99)
- DynRankView with 0 dimensions passed in at construction [\#293](https://github.com/kokkos/kokkos/issues/293)
- Inject view\_alloc and friends into Kokkos namespace [\#292](https://github.com/kokkos/kokkos/issues/292)
- Less restrictive TeamPolicy reduction on Cuda [\#286](https://github.com/kokkos/kokkos/issues/286)
- deep\_copy using remap with source execution space [\#267](https://github.com/kokkos/kokkos/issues/267)
- Suggestion: Enable opt-in L1 caching via nvcc-wrapper [\#261](https://github.com/kokkos/kokkos/issues/261)
- More flexible create\_mirror functions [\#260](https://github.com/kokkos/kokkos/issues/260)
- Rename View::memory\_span to View::required\_allocation\_size [\#256](https://github.com/kokkos/kokkos/issues/256)
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
- Use of subviews and views with compile-time dimensions [\#237](https://github.com/kokkos/kokkos/issues/237)
- Kokkos::Timer [\#234](https://github.com/kokkos/kokkos/issues/234)
- Fence CudaUVMSpace allocations [\#230](https://github.com/kokkos/kokkos/issues/230)
- View::operator\(\) accept std::is\_integral and std::is\_enum [\#227](https://github.com/kokkos/kokkos/issues/227)
- Allocating zero size View [\#216](https://github.com/kokkos/kokkos/issues/216)
- Thread scalable memory pool [\#212](https://github.com/kokkos/kokkos/issues/212)
- Add a way to disable memory leak output [\#194](https://github.com/kokkos/kokkos/issues/194)
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
- Runtime rank wrapper for View [\#189](https://github.com/kokkos/kokkos/issues/189)
- Profiling Interface [\#158](https://github.com/kokkos/kokkos/issues/158)
- Fix View assignment \(of managed to unmanaged\) [\#153](https://github.com/kokkos/kokkos/issues/153)
- Add unit test for assignment of managed View to unmanaged View [\#152](https://github.com/kokkos/kokkos/issues/152)
- Check for oversubscription of threads with MPI in Kokkos::initialize [\#149](https://github.com/kokkos/kokkos/issues/149)
- Dynamic resizeable 1dimensional view [\#143](https://github.com/kokkos/kokkos/issues/143)
- Develop TaskPolicy for CUDA [\#142](https://github.com/kokkos/kokkos/issues/142)
- New View : Test Compilation Downstream [\#138](https://github.com/kokkos/kokkos/issues/138)
- New View Implementation [\#135](https://github.com/kokkos/kokkos/issues/135)
- Add variant of subview that lets users add traits [\#134](https://github.com/kokkos/kokkos/issues/134)
- NVCC-WRAPPER: Add --host-only flag [\#121](https://github.com/kokkos/kokkos/issues/121)
- Address gtest issue with TriBITS Kokkos build outside of Trilinos [\#117](https://github.com/kokkos/kokkos/issues/117)
- Make tests pass with -expt-extended-lambda on CUDA [\#108](https://github.com/kokkos/kokkos/issues/108)
- Dynamic scheduling for parallel\_for and parallel\_reduce [\#106](https://github.com/kokkos/kokkos/issues/106)
- Runtime or compile time error when reduce functor's join is not properly specified as const member function or with volatile arguments [\#105](https://github.com/kokkos/kokkos/issues/105)
- Error out when the number of threads is modified after kokkos is initialized [\#104](https://github.com/kokkos/kokkos/issues/104)
- Porting to POWER and remove assumption of X86 default [\#103](https://github.com/kokkos/kokkos/issues/103)
- Dynamic scheduling option for RangePolicy [\#100](https://github.com/kokkos/kokkos/issues/100)
- SharedMemory Support for Lambdas [\#81](https://github.com/kokkos/kokkos/issues/81)
- Recommended TeamSize for Lambdas [\#80](https://github.com/kokkos/kokkos/issues/80)
- Add Aggressive Vectorization Compilation mode [\#72](https://github.com/kokkos/kokkos/issues/72)
- Dynamic scheduling team execution policy [\#53](https://github.com/kokkos/kokkos/issues/53)
- UVM allocations in multi-GPU systems [\#50](https://github.com/kokkos/kokkos/issues/50)
- Synchronic in Kokkos::Impl [\#44](https://github.com/kokkos/kokkos/issues/44)
- index and dimension types in for loops [\#28](https://github.com/kokkos/kokkos/issues/28)
- Subview assign of 1D Strided with stride 1 to LayoutLeft/Right [\#1](https://github.com/kokkos/kokkos/issues/1)
**Fixed bugs:**
- misspelled variable name in Kokkos\_Atomic\_Fetch + missing unit tests [\#340](https://github.com/kokkos/kokkos/issues/340)
- seg fault Kokkos::Impl::CudaInternal::print\_configuration [\#338](https://github.com/kokkos/kokkos/issues/338)
- Clang compiler error with named parallel\_reduce, tags, and TeamPolicy. [\#335](https://github.com/kokkos/kokkos/issues/335)
- Shared Memory Allocation Error at parallel\_reduce [\#311](https://github.com/kokkos/kokkos/issues/311)
- DynRankView: Fix resize and realloc [\#303](https://github.com/kokkos/kokkos/issues/303)
- Scratch memory and dynamic scheduling [\#279](https://github.com/kokkos/kokkos/issues/279)
- MemoryPool infinite loop when out of memory [\#312](https://github.com/kokkos/kokkos/issues/312)
- Kokkos DynRankView changes break Sacado and Panzer [\#299](https://github.com/kokkos/kokkos/issues/299)
- MemoryPool fails to compile on non-cuda non-x86 [\#297](https://github.com/kokkos/kokkos/issues/297)
- Random Number Generator Fix [\#296](https://github.com/kokkos/kokkos/issues/296)
- View template parameter ordering Bug [\#282](https://github.com/kokkos/kokkos/issues/282)
- Serial task policy broken. [\#281](https://github.com/kokkos/kokkos/issues/281)
- deep\_copy with LayoutStride should not memcpy [\#262](https://github.com/kokkos/kokkos/issues/262)
- DualView::need\_sync should be a const method [\#248](https://github.com/kokkos/kokkos/issues/248)
- Arbitrary-sized atomics on GPUs broken; loop forever [\#238](https://github.com/kokkos/kokkos/issues/238)
- boolean reduction value\_type changes answer [\#225](https://github.com/kokkos/kokkos/issues/225)
- Custom init\(\) function for parallel\_reduce with array value\_type [\#210](https://github.com/kokkos/kokkos/issues/210)
- unit\_test Makefile is Broken - Recursively Calls itself until Machine Apocalypse. [\#202](https://github.com/kokkos/kokkos/issues/202)
- nvcc\_wrapper Does Not Support -Xcompiler \ [\#198](https://github.com/kokkos/kokkos/issues/198)
- Kokkos exec space init should init Kokkos profiling [\#192](https://github.com/kokkos/kokkos/issues/192)
- Kokkos Threads Backend impl\_shared\_alloc Broken on Intel 16.1 \(Shepard Haswell\) [\#186](https://github.com/kokkos/kokkos/issues/186)
- pthread back end hangs if used uninitialized [\#182](https://github.com/kokkos/kokkos/issues/182)
- parallel\_reduce of size 0, not calling init/join [\#175](https://github.com/kokkos/kokkos/issues/175)
- Bug in Threads with OpenMP enabled [\#173](https://github.com/kokkos/kokkos/issues/173)
- KokkosExp\_SharedAlloc, m\_team\_work\_index inaccessible [\#166](https://github.com/kokkos/kokkos/issues/166)
- 128-bit CAS without Assembly Broken? [\#161](https://github.com/kokkos/kokkos/issues/161)
- fatal error: Cuda/Kokkos\_Cuda\_abort.hpp: No such file or directory [\#157](https://github.com/kokkos/kokkos/issues/157)
- Power8: Fix OpenMP backend [\#139](https://github.com/kokkos/kokkos/issues/139)
- Data race in Kokkos OpenMP initialization [\#131](https://github.com/kokkos/kokkos/issues/131)
- parallel\_launch\_local\_memory and cuda 7.5 [\#125](https://github.com/kokkos/kokkos/issues/125)
- Resize can fail with Cuda due to asynchronous dispatch [\#119](https://github.com/kokkos/kokkos/issues/119)
- Qthread taskpolicy initialization bug. [\#92](https://github.com/kokkos/kokkos/issues/92)
- Windows: sys/mman.h [\#89](https://github.com/kokkos/kokkos/issues/89)
- Windows: atomic\_fetch\_sub\(\) [\#88](https://github.com/kokkos/kokkos/issues/88)
- Windows: snprintf [\#87](https://github.com/kokkos/kokkos/issues/87)
- Parallel\_Reduce with TeamPolicy and league size of 0 returns garbage [\#85](https://github.com/kokkos/kokkos/issues/85)
- Throw with Cuda when using \(2D\) team\_policy parallel\_reduce with less than a warp size [\#76](https://github.com/kokkos/kokkos/issues/76)
- Scalar views don't work with Kokkos::Atomic memory trait [\#69](https://github.com/kokkos/kokkos/issues/69)
- Reduce the number of threads per team for Cuda [\#63](https://github.com/kokkos/kokkos/issues/63)
- Named Kernels fail for reductions with CUDA [\#60](https://github.com/kokkos/kokkos/issues/60)
- Kokkos View dimension\_\(\) for long returning unsigned int [\#20](https://github.com/kokkos/kokkos/issues/20)
- atomic test hangs with LLVM [\#6](https://github.com/kokkos/kokkos/issues/6)
- OpenMP Test should set omp\_set\_num\_threads to 1 [\#4](https://github.com/kokkos/kokkos/issues/4)
**Closed issues:**
- develop branch broken with CUDA 8 and --expt-extended-lambda [\#354](https://github.com/kokkos/kokkos/issues/354)
- --arch=KNL with Intel 2016 build failure [\#349](https://github.com/kokkos/kokkos/issues/349)
- Error building with Cuda when passing -DKOKKOS\_CUDA\_USE\_LAMBDA to generate\_makefile.bash [\#343](https://github.com/kokkos/kokkos/issues/343)
- Can I safely use int indices in a 2-D View with capacity \> 2B? [\#318](https://github.com/kokkos/kokkos/issues/318)
- Kokkos::ViewAllocateWithoutInitializing is not working [\#317](https://github.com/kokkos/kokkos/issues/317)
- Intel build on Mac OS X [\#277](https://github.com/kokkos/kokkos/issues/277)
- deleted [\#271](https://github.com/kokkos/kokkos/issues/271)
- Broken Mira build [\#268](https://github.com/kokkos/kokkos/issues/268)
- 32-bit build [\#246](https://github.com/kokkos/kokkos/issues/246)
- parallel\_reduce with RDC crashes linker [\#232](https://github.com/kokkos/kokkos/issues/232)
- build of Kokkos\_Sparse\_MV\_impl\_spmv\_Serial.cpp.o fails if you use nvcc and have cuda disabled [\#209](https://github.com/kokkos/kokkos/issues/209)
- Kokkos Serial execution space is not tested with TeamPolicy. [\#207](https://github.com/kokkos/kokkos/issues/207)
- Unit test failure on Hansen KokkosCore\_UnitTest\_Cuda\_MPI\_1 [\#200](https://github.com/kokkos/kokkos/issues/200)
- nvcc compiler warning: calling a \_\_host\_\_ function from a \_\_host\_\_ \_\_device\_\_ function is not allowed [\#180](https://github.com/kokkos/kokkos/issues/180)
- Intel 15 build error with defaulted "move" operators [\#171](https://github.com/kokkos/kokkos/issues/171)
- missing libkokkos.a during Trilinos 12.4.2 build, yet other libkokkos\*.a libs are there [\#165](https://github.com/kokkos/kokkos/issues/165)
- Tie atomic updates to execution space or even to thread team? \(speculation\) [\#144](https://github.com/kokkos/kokkos/issues/144)
- New View: Compiletime/size Test [\#137](https://github.com/kokkos/kokkos/issues/137)
- New View : Performance Test [\#136](https://github.com/kokkos/kokkos/issues/136)
- Signed/unsigned comparison warning in CUDA parallel [\#130](https://github.com/kokkos/kokkos/issues/130)
- Kokkos::complex: Need op\* w/ std::complex & real [\#126](https://github.com/kokkos/kokkos/issues/126)
- Use uintptr\_t for casting pointers [\#110](https://github.com/kokkos/kokkos/issues/110)
- Default thread mapping behavior between P and Q threads. [\#91](https://github.com/kokkos/kokkos/issues/91)
- Windows: Atomic\_Fetch\_Exchange\(\) return type [\#90](https://github.com/kokkos/kokkos/issues/90)
- Synchronic unit test is way too long [\#84](https://github.com/kokkos/kokkos/issues/84)
- nvcc\_wrapper -\> $\(NVCC\_WRAPPER\) [\#42](https://github.com/kokkos/kokkos/issues/42)
- Check compiler version and print helpful message [\#39](https://github.com/kokkos/kokkos/issues/39)
- Kokkos shared memory on Cuda uses a lot of registers [\#31](https://github.com/kokkos/kokkos/issues/31)
- Can not pass unit test `cuda.space` without a GT 720 [\#25](https://github.com/kokkos/kokkos/issues/25)
- Makefile.kokkos lacks bounds checking option that CMake has [\#24](https://github.com/kokkos/kokkos/issues/24)
- Kokkos can not complete unit tests with CUDA UVM enabled [\#23](https://github.com/kokkos/kokkos/issues/23)
- Simplify teams + shared memory histogram example to remove vectorization [\#21](https://github.com/kokkos/kokkos/issues/21)
- Kokkos needs to rever to ${PROJECT\_NAME}\_ENABLE\_CXX11 not Trilinos\_ENABLE\_CXX11 [\#17](https://github.com/kokkos/kokkos/issues/17)
- Kokkos Base Makefile adds AVX to KNC Build [\#16](https://github.com/kokkos/kokkos/issues/16)
- MS Visual Studio 2013 Build Errors [\#9](https://github.com/kokkos/kokkos/issues/9)
- subview\(X, ALL\(\), j\) for 2-D LayoutRight View X: should it view a column? [\#5](https://github.com/kokkos/kokkos/issues/5)
## [End_C++98](https://github.com/kokkos/kokkos/tree/End_C++98) (2015-04-15)
\* *This Change Log was automatically generated by [github_changelog_generator](https://github.com/skywinder/Github-Changelog-Generator)*
kokkos-3.7.01/CMakeLists.txt 0000664 0000000 0000000 00000032244 14343743117 0015657 0 ustar 00root root 0000000 0000000 cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
# Refuse to configure when the build tree coincides with the source tree:
# generated files would otherwise be written into (and could corrupt) the
# checkout.
IF ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
  MESSAGE(FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt.")
ENDIF()
# Snapshot the user-supplied Kokkos settings before any option parsing.
# Every variable defined at this point whose name starts with "KOKKOS" in
# any letter case (Kokkos_*, KOKKOS_*, kokkos_*, ...) is appended to
# KOKKOS_GIVEN_VARIABLES, so that the given names can later be compared
# against the list of valid arguments and wrong-case spellings detected.
get_cmake_property(_variableNames VARIABLES)
set(KOKKOS_GIVEN_VARIABLES)
foreach(var ${_variableNames})
  string(TOUPPER ${var} UC_VAR)
  # IDX is the position of the first "KOKKOS" in the upper-cased name
  # (-1 if absent); position 0 means the name begins with it.
  string(FIND ${UC_VAR} KOKKOS IDX)
  if(IDX EQUAL 0)
    list(APPEND KOKKOS_GIVEN_VARIABLES ${var})
  endif()
endforeach()
# Basic path initialization (used in KOKKOS_SETTINGS).
# All four source-path spellings refer to the directory containing this
# CMakeLists.txt; KOKKOS_TOP_BUILD_DIR is the matching binary directory.
set(Kokkos_SOURCE_DIR    ${CMAKE_CURRENT_SOURCE_DIR})
set(KOKKOS_SOURCE_DIR    ${CMAKE_CURRENT_SOURCE_DIR})
set(KOKKOS_SRC_PATH      ${CMAKE_CURRENT_SOURCE_DIR})
set(KOKKOS_PATH          ${CMAKE_CURRENT_SOURCE_DIR})
set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR})
# Decide whether Kokkos is being configured as part of Trilinos: the
# presence of the TRIBITS_PACKAGE_DECL command is used as the indicator.
# Default to a stand-alone build and flip the flag only when it exists.
set(KOKKOS_HAS_TRILINOS OFF)
if(COMMAND TRIBITS_PACKAGE_DECL)
  set(KOKKOS_HAS_TRILINOS ON)
endif()
# Find out whether another project pulled this directory in: HAS_PARENT
# receives this directory's PARENT_DIRECTORY property.
get_directory_property(HAS_PARENT PARENT_DIRECTORY)

# Helper function definitions and C++ standard selection logic.
include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake)
# Start the four KOKKOS_ENABLED_* lists from a clean (valueless) state.
# Each of them is exported in the config file.
set(KOKKOS_ENABLED_OPTIONS)
set(KOKKOS_ENABLED_DEVICES)
set(KOKKOS_ENABLED_TPLS)
set(KOKKOS_ENABLED_ARCH_LIST)
# Helper flags for sanity checks during configuration: certain features
# must only be configured after the features they depend on. Each
# KOKKOS_CFG_DAG_<STEP> flag starts Off and is switched On once <STEP> has
# been configured (see KOKKOS_CFG_DEPENDS).
set(KOKKOS_CFG_DAG_NONE On) # sentinel: "depends on nothing", always satisfied
set(KOKKOS_CFG_DAG_DEVICES_DONE     Off)
set(KOKKOS_CFG_DAG_OPTIONS_DONE     Off)
set(KOKKOS_CFG_DAG_ARCH_DONE        Off)
set(KOKKOS_CFG_DAG_CXX_STD_DONE     Off)
set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off)
# KOKKOS_CFG_DEPENDS(<SUCCESSOR> <PRECURSOR>)
#
# Enforces configuration ordering. Aborts with FATAL_ERROR unless the flag
# KOKKOS_CFG_DAG_<PRECURSOR> is already true; otherwise marks
# KOKKOS_CFG_DAG_<SUCCESSOR> as On.
#
#   SUCCESSOR  suffix of the flag to set once the dependency check passes
#   PRECURSOR  suffix of the flag that must already be On
function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR)
  if(NOT KOKKOS_CFG_DAG_${PRECURSOR})
    message(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured")
  endif()
  # GLOBAL_SET is a TriBITS-style helper defined outside this block;
  # presumably it makes the flag visible beyond this function's scope.
  GLOBAL_SET(KOKKOS_CFG_DAG_${SUCCESSOR} On)
endfunction()
# Add the bundled cmake/Modules directory to the module search path.
# NOTE(review): the entry is a relative path; confirm that it resolves
# against the source directory as intended in out-of-source builds.
list(APPEND CMAKE_MODULE_PATH cmake/Modules)
# Stand-alone (non-Trilinos) project setup:
#   1. pick the language Kokkos is compiled as (CXX, or CUDA on request),
#   2. apply the optional Spack compiler workaround,
#   3. declare the project,
#   4. choose a default build type when Kokkos is the top-level project.
IF(NOT KOKKOS_HAS_TRILINOS)
  set(CMAKE_DISABLE_SOURCE_CHANGES ON)
  set(CMAKE_DISABLE_IN_SOURCE_BUILD ON)

  # What language are we compiling Kokkos as
  # downstream dependencies need to match this!
  SET(KOKKOS_COMPILE_LANGUAGE CXX)
  # use lower case here since we didn't parse options yet
  IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA)
    # Without this as a language for the package we would get a C++ compiler enabled.
    # but we still need a C++ compiler even if we build all our cpp files as CUDA only
    # because otherwise the C++ features don't work etc.
    # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even
    # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5
    # days).
    SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX)
    SET(KOKKOS_COMPILE_LANGUAGE CUDA)
  ENDIF()

  IF (Spack_WORKAROUND)
    # The two features are mutually exclusive for now; name the option exactly
    # as the user has to spell it so the message is actionable.
    IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE)
      MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE in a spack installation!")
    ENDIF()

    #if we are explicitly using Spack for development,
    #nuke the Spack compiler
    SET(SPACK_CXX $ENV{SPACK_CXX})
    IF(SPACK_CXX)
      # Override both the cached compiler and the CXX environment variable
      # with the compiler named by SPACK_CXX.
      SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE)
      SET(ENV{CXX} ${SPACK_CXX})
    ENDIF()
  ENDIF()

  # Always call the project command to define Kokkos_ variables
  # and to make sure that C++ is an enabled language
  PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE})

  # Only impose a default build type when nobody above us could have chosen one.
  IF(NOT HAS_PARENT)
    IF (NOT CMAKE_BUILD_TYPE)
      SET(DEFAULT_BUILD_TYPE "RelWithDebInfo")
      MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.")
      SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING
        "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel."
        FORCE)
    ENDIF()
  ENDIF()
ENDIF()
# Validate the result of CMake's compiler check.
# An unset/zero CMAKE_SIZEOF_VOID_P means compiler validation did not succeed;
# any value other than 8 means this is not a 64-bit build.
IF (NOT CMAKE_SIZEOF_VOID_P)
  # Quote the compiler path: if it is empty, an unquoted expansion would drop
  # the argument and STRING(FIND) would fail with an argument-count error
  # instead of reaching the diagnostics below.
  STRING(FIND "${CMAKE_CXX_COMPILER}" nvcc_wrapper FIND_IDX)
  IF (NOT FIND_IDX STREQUAL -1)
    MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.")
  ELSE()
    MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation")
  ENDIF()
ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
  MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build; i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead")
ENDIF()
# Release identification: the three components, the dotted string
# "<major>.<minor>.<patch>", and the packed integer
# major * 10000 + minor * 100 + patch stored in KOKKOS_VERSION.
set(Kokkos_VERSION_MAJOR 3)
set(Kokkos_VERSION_MINOR 7)
set(Kokkos_VERSION_PATCH 01)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}")
string(APPEND Kokkos_VERSION ".${Kokkos_VERSION_PATCH}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_PATCH} + 100 * ${Kokkos_VERSION_MINOR} + 10000 * ${Kokkos_VERSION_MAJOR}")
# Load either the real TriBITS or a TriBITS wrapper
# for certain utility functions that are universal (like GLOBAL_SET).
# This include must come before the first GLOBAL_SET call below.
INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake)
IF (Kokkos_ENABLE_CUDA)
# If we are building CUDA, we have tricked CMake because we declare a CXX project.
# If the default C++ standard for a given compiler matches the requested
# standard, then later versions of CMake just omit the -std flag.
# This breaks CUDA compilation (the CUDA compiler can have a different default
# -std than the underlying host compiler by itself). Setting this variable
# forces CMake to always add the -std flag even if it thinks it doesn't need it.
GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98)
ENDIF()
# Flag accumulators that get appended to during configuration. They are set
# through GLOBAL_SET rather than as regular variables because scoping issues
# would otherwise make them hard to share.
foreach(_kokkos_flag_kind COMPILE LINK AMDGPU CUDA CUDAFE XCOMPILER)
  global_set(KOKKOS_${_kokkos_flag_kind}_OPTIONS)
endforeach()
# Text appended here makes sure the TPLs we import are also available
# for an installed Kokkos.
global_set(KOKKOS_TPL_EXPORTS)
# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler
global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE)
# MSVC never goes through kokkos_launch_compiler, so only the other
# toolchains receive the marker on the link line.
if(NOT MSVC)
  global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE)
endif()
# Standalone builds with tests enabled look for a GTest installation.
# The lookup is not REQUIRED, so configuration continues if none is found.
IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS)
find_package(GTest)
ENDIF()
# Include a set of Kokkos-specific wrapper functions that
# will either call raw CMake or TriBITS
# These are functions like KOKKOS_INCLUDE_DIRECTORIES
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake)
# Check the environment and set certain variables
# to allow platform-specific checks
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake)
# The build environment setup goes in the following steps
# 1) Check all the enable options. This includes checking Kokkos_DEVICES
# 2) Check the compiler ID (type and version)
# 3) Check the CXX standard and select important CXX flags
# 4) Check for any third-party libraries (TPLs) like hwloc
# 5) Check if optimizing for a particular architecture and add arch-specific flags
# NOTE(review): presumably defined by one of the files included above — confirm.
KOKKOS_SETUP_BUILD_ENVIRONMENT()
# Remaining steps of the build:
# 6) Recurse into subdirectories and configure individual libraries
# 7) Export and install targets
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
# Internal sub-library names; the externally visible names are the umbrella
# target followed by the same names in the Kokkos:: namespace.
set(KOKKOS_SUB_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms)
set(KOKKOS_EXT_LIBRARIES Kokkos::kokkos)
foreach(_kokkos_sub ${KOKKOS_SUB_LIBRARIES})
  list(APPEND KOKKOS_EXT_LIBRARIES Kokkos::${_kokkos_sub})
endforeach()
# The SIMD library is only added for C++17 and newer.
if(KOKKOS_CXX_STANDARD GREATER_EQUAL 17)
  list(APPEND KOKKOS_SUB_LIBRARIES kokkossimd)
  list(APPEND KOKKOS_EXT_LIBRARIES Kokkos::kokkossimd)
endif()
set(KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_SUB_LIBRARIES})
set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES ${KOKKOS_INT_LIBRARIES})
# Header install location depends on how Kokkos is being built:
# inside Trilinos, as a subdirectory of another project, or standalone.
if(KOKKOS_HAS_TRILINOS)
  set(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR})
  set(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR})
elseif(HAS_PARENT)
  set(KOKKOS_HEADER_DIR "include/kokkos")
else()
  set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}")
endif()
# Only the standalone case is not a subdirectory build.
if(KOKKOS_HAS_TRILINOS OR HAS_PARENT)
  set(KOKKOS_IS_SUBDIRECTORY TRUE)
else()
  set(KOKKOS_IS_SUBDIRECTORY FALSE)
endif()
#------------------------------------------------------------------------------
#
# A) Forward declare the package so that certain options are also defined for
# subpackages
## This restores the old behavior of ProjectCompilerPostConfig.cmake
# It sets the CMAKE_CXX_FLAGS globally to those used by Kokkos
# We must do this before KOKKOS_PACKAGE_DECL
# Trilinos builds only: flatten the Kokkos option lists into the
# space-separated CMAKE_CXX_FLAGS string that TriBITS expects, and build
# KOKKOS_ALL_COMPILE_OPTIONS restricted to C++ translation units.
IF (KOKKOS_HAS_TRILINOS)
  # Overwrite the old flags at the top-level
  # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags
  # we have to match the annoying behavior, also we have to preserve quotes
  # which needs another workaround.
  SET(KOKKOS_COMPILE_OPTIONS_TMP)
  IF (KOKKOS_ENABLE_HIP)
    LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS})
  ENDIF()
  # Re-quote any option containing a space so it survives the join below
  # as a single argument.
  FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS})
    STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE)
    IF(OPTION_HAS_WHITESPACE EQUAL -1)
      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}")
    ELSE()
      LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"")
    ENDIF()
  ENDFOREACH()
  STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}")
  LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS})
  IF (KOKKOS_ENABLE_CUDA)
    LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS})
  ENDIF()
  # Host-compiler flags are each prefixed with -Xcompiler.
  FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS})
    SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}")
    LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG})
  ENDFOREACH()
  SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_COMPILE_OPTIONS} ${KOKKOSCORE_XCOMPILER_OPTIONS}")
  IF (KOKKOS_ENABLE_CUDA)
    STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}")
    # CUDA front-end flags are each prefixed with -Xcudafe.
    FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS})
      SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}")
      LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG})
    ENDFOREACH()
    SET(KOKKOSCORE_CXX_FLAGS "${KOKKOSCORE_CXX_FLAGS} ${KOKKOSCORE_CUDA_OPTIONS} ${KOKKOSCORE_CUDAFE_OPTIONS}")
  ENDIF()
  # Both parent scope and this package
  # In ProjectCompilerPostConfig.cmake, we capture the "global" flags Trilinos wants in
  # TRILINOS_TOPLEVEL_CXX_FLAGS
  SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}" PARENT_SCOPE)
  SET(CMAKE_CXX_FLAGS "${TRILINOS_TOPLEVEL_CXX_FLAGS} ${KOKKOSCORE_CXX_FLAGS}")
  #CMAKE_CXX_FLAGS will get added to Kokkos and Kokkos dependencies automatically here
  #These flags get set up in KOKKOS_PACKAGE_DECL, which means they
  #must be configured before KOKKOS_PACKAGE_DECL
  # Wrap the options in a generator expression so they apply only when
  # compiling C++ sources.
  SET(KOKKOS_ALL_COMPILE_OPTIONS
    $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_ALL_COMPILE_OPTIONS}>)
ENDIF()
# Declare the package. The compiler flags prepared above must already be in
# place at this point.
KOKKOS_PACKAGE_DECL()
#------------------------------------------------------------------------------
#
# D) Process the subpackages (subdirectories) for Kokkos
#
KOKKOS_PROCESS_SUBPACKAGES()
#------------------------------------------------------------------------------
#
# E) If Kokkos itself is enabled, process the Kokkos package
#
KOKKOS_PACKAGE_DEF()
KOKKOS_EXCLUDE_AUTOTOOLS_FILES()
KOKKOS_PACKAGE_POSTPROCESS()
KOKKOS_CONFIGURE_CORE()
# Outside Trilinos (and when not testing an installation) define the umbrella
# INTERFACE target `kokkos`, which only forwards to the sub-libraries.
IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING)
ADD_LIBRARY(kokkos INTERFACE)
#Make sure in-tree projects can reference this as Kokkos::
#to match the installed target names
ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos)
TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_SUB_LIBRARIES})
KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos)
ENDIF()
INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake)
# nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler.
# Kokkos needs nvcc_wrapper in order to build. Other libraries and
# executables also need nvcc_wrapper. Thus, we need to install it.
# If the argument of DESTINATION is a relative path, CMake computes it
# as relative to ${CMAKE_INSTALL_PREFIX}.
# KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated
# files
KOKKOS_INSTALL_ADDITIONAL_FILES()
# Finally - if we are a subproject - make sure the enabled devices are visible
# to the parent project as Kokkos_ENABLE_<DEVICE> variables and global properties.
IF (HAS_PARENT)
  # Iterate over the contents of the KOKKOS_ENABLED_DEVICES list. Passing a
  # bare name to FOREACH would loop once over the name itself and define
  # nothing useful.
  FOREACH(DEV IN LISTS KOKKOS_ENABLED_DEVICES)
    #I would much rather not make these cache variables or global properties, but I can't
    #make any guarantees on whether PARENT_SCOPE is good enough to make
    #these variables visible where I need them
    SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE)
    SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON)
  ENDFOREACH()
ENDIF()
kokkos-3.7.01/CONTRIBUTING.md 0000664 0000000 0000000 00000001123 14343743117 0015340 0 ustar 00root root 0000000 0000000 # Contributing to Kokkos
## Pull Requests
We actively welcome pull requests.
1. Fork the repo and create your branch from `develop`.
2. If you've added code that should be tested, add tests.
3. If you've changed APIs, update the documentation.
4. Ensure the test suite passes.
## Issues
We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.
## License
By contributing to Kokkos, you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree.
kokkos-3.7.01/Copyright.txt 0000664 0000000 0000000 00000003665 14343743117 0015635 0 ustar 00root root 0000000 0000000 //@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
kokkos-3.7.01/HOW_TO_SNAPSHOT 0000664 0000000 0000000 00000005257 14343743117 0015424 0 ustar 00root root 0000000 0000000
Developers of Kokkos (those who commit modifications to Kokkos)
must maintain the snapshot of Kokkos in the Trilinos repository.
This file contains instructions for how to
snapshot Kokkos from github.com/kokkos to Trilinos.
------------------------------------------------------------------------
*** EVERYTHING GOES RIGHT WORKFLOW ***
1) Given a 'git clone' of Kokkos and of Trilinos repositories.
1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone.
This path *must* terminate with the directory name 'kokkos';
e.g., ${HOME}/kokkos .
1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory.
2) Given that the Kokkos build & test is clean and
changes are committed to the Kokkos clone.
3) Snapshot the current commit in the Kokkos clone into the Trilinos clone.
This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}:
${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages
4) Verify the snapshot commit happened as expected
cd ${TRILINOS}/packages/kokkos
git log -1 --name-only
5) Modify, build, and test Trilinos with the Kokkos snapshot.
6) Given that the Trilinos build & test is clean and
changes are committed to the Trilinos clone.
7) Attempt push to the Kokkos repository.
If push fails then you must 'remove the Kokkos snapshot'
from your Trilinos clone.
See below.
8) Attempt to push to the Trilinos repository.
If updating for a failed push requires you to change Kokkos you must
'remove the Kokkos snapshot' from your Trilinos clone.
See below.
------------------------------------------------------------------------
*** WHEN SOMETHING GOES WRONG AND YOU MUST ***
*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE ***
1) Query the Trilinos clone commit log.
git log --oneline
2) Note the <SHA1> of the commit to the Trilinos clone
immediately BEFORE the Kokkos snapshot commit.
Copy this for use in the next command.
3) IF more than one outstanding commit then you can remove just the
Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file.
Remove or comment out the Kokkos snapshot commit entry.
git rebase -i <SHA1>
4) IF the Kokkos snapshot commit is the one and only
outstanding commit then remove just that commit.
git reset --hard HEAD~1
------------------------------------------------------------------------
*** REGARDING 'snapshot.py' TOOL ***
The 'snapshot.py' tool is developed and maintained by the
Center for Computing Research (CCR)
Software Engineering, Maintenance, and Support (SEMS) team.
Contact Brent Perschbacher for questions.
------------------------------------------------------------------------
kokkos-3.7.01/LICENSE 0000664 0000000 0000000 00000003763 14343743117 0014130 0 ustar 00root root 0000000 0000000 //@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Kokkos is licensed under 3-clause BSD terms of use:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
kokkos-3.7.01/Makefile.kokkos 0000664 0000000 0000000 00000201757 14343743117 0016066 0 ustar 00root root 0000000 0000000 # Default settings common options.
# Kokkos release number. KOKKOS_VERSION packs it into a single integer,
# MAJOR*10000 + MINOR*100 + PATCH, evaluated with bc.
KOKKOS_VERSION_MAJOR = 3
KOKKOS_VERSION_MINOR = 7
KOKKOS_VERSION_PATCH = 01
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
# All settings below use ?= so values supplied by the caller or the
# environment take precedence over these defaults.
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
#KOKKOS_DEVICES ?= "OpenMP"
KOKKOS_DEVICES ?= "Threads"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKL,SKX,ICL,ICX,SPR
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86
# ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX
# IBM: BGQ,Power7,Power8,Power9
# AMD-GPUS: Vega900,Vega906,Vega908,Vega90A
# AMD-CPUS: AMDAVX,Zen,Zen2,Zen3
# Intel-GPUs: IntelGen,IntelGen9,IntelGen11,IntelGen12LP,IntelDG1,IntelXeHP,PVC
KOKKOS_ARCH ?= ""
# Options: yes,no
KOKKOS_DEBUG ?= "no"
# Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
# Options: c++14,c++1y,c++17,c++1z,c++2a,c++20
KOKKOS_CXX_STANDARD ?= "c++14"
# Options: aggressive_vectorization,disable_profiling,enable_large_mem_tests,disable_complex_align,disable_deprecated_code,enable_deprecation_warnings,disable_desul_atomics
KOKKOS_OPTIONS ?= ""
KOKKOS_CMAKE ?= "no"
KOKKOS_TRIBITS ?= "no"
KOKKOS_STANDALONE_CMAKE ?= "no"
# Default settings specific options.
# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr
KOKKOS_CUDA_OPTIONS ?= ""
# Options: rdc
KOKKOS_HIP_OPTIONS ?= ""
# Default settings specific options.
# Options: enable_async_dispatch
KOKKOS_HPX_OPTIONS ?= ""
# Helper functions for conversion to upper case
# uppercase_TABLE holds one "lower,UPPER" pair per letter.
uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z
# uppercase_internal recurses over the table ($1) and wraps the text ($2) in
# one nested subst call per pair; the doubled dollar sign defers evaluation of
# those subst calls until the eval in `uppercase` below.
uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2)
# uppercase evaluates the generated expression into uppercase_RESULT and
# expands to that value. Usage: $(call uppercase,text)
uppercase=$(eval uppercase_RESULT:=$(call uppercase_internal,$(uppercase_TABLE),$1))$(uppercase_RESULT)
# Return a 1 if a string contains a substring and 0 if not
# Both arguments are upper-cased first, so the match is case-insensitive.
# Note the search string should be without '"'
# Example: $(call kokkos_has_string,"hwloc,librt",hwloc)
# Will return a 1
kokkos_has_string=$(if $(findstring $(call uppercase,$2),$(call uppercase,$1)),1,0)
# Returns 1 if the path exists, 0 otherwise
# Example: $(call kokkos_path_exists,/path/to/file)
# Will return a 1 if /path/to/file exists
kokkos_path_exists=$(if $(wildcard $1),1,0)
# Check for general settings
# Every KOKKOS_INTERNAL_* variable in this section is 1 or 0, depending on
# whether the keyword occurs (case-insensitively, as a substring) in the
# corresponding user setting.
KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14)
KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y)
KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
KOKKOS_INTERNAL_ENABLE_CXX20 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++20)
# Check for external libraries.
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt)
KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind)
# Check for advanced settings.
KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings)
KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization)
KOKKOS_INTERNAL_ENABLE_TUNING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_tuning)
KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_complex_align)
KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
# Backend-specific option strings (CUDA, HPX).
KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg)
KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr)
KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch)
# deprecated
KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics)
KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_desul_atomics)
KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings)
KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc)
# Check for Kokkos Host Execution Spaces one of which must be on.
# "OpenMPTarget" is removed from the device string before testing for OpenMP,
# so selecting only OpenMPTarget does not switch on the OpenMP host backend.
KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP)
KOKKOS_INTERNAL_USE_THREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Threads)
# deprecated
KOKKOS_INTERNAL_USE_PTHREAD := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread)
KOKKOS_INTERNAL_USE_HPX := $(call kokkos_has_string,$(KOKKOS_DEVICES),HPX)
KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial)
# The deprecated name Pthread is mapped onto Threads, with a warning.
ifeq ($(KOKKOS_INTERNAL_USE_PTHREAD), 1)
KOKKOS_INTERNAL_USE_THREADS := 1
$(warning Warning: Pthread is deprecated. Use Threads instead! KOKKOS_DEVICES=$(KOKKOS_DEVICES))
endif
# Fall back to Serial when none of OpenMP, Threads or HPX was requested.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 0)
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 0)
KOKKOS_INTERNAL_USE_SERIAL := 1
endif
endif
endif
# Check for other Execution Spaces.
KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda)
KOKKOS_INTERNAL_USE_HIP := $(call kokkos_has_string,$(KOKKOS_DEVICES),HIP)
KOKKOS_INTERNAL_USE_SYCL := $(call kokkos_has_string,$(KOKKOS_DEVICES),SYCL)
KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget)
# KOKKOS_DEVICELIST collects the names of all enabled backends.
KOKKOS_DEVICELIST =
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
KOKKOS_DEVICELIST += Serial
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
KOKKOS_DEVICELIST += OpenMP
endif
ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
KOKKOS_DEVICELIST += Threads
endif
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
KOKKOS_DEVICELIST += HPX
endif
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_DEVICELIST += Cuda
endif
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
KOKKOS_DEVICELIST += HIP
endif
# Sum of the C++17, C++20 and C++2a flags; the checks below require exactly 1.
# NOTE(review): c++1z is not counted here — confirm whether that is intended.
KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER := $(shell expr $(KOKKOS_INTERNAL_ENABLE_CXX17) \
+ $(KOKKOS_INTERNAL_ENABLE_CXX20) \
+ $(KOKKOS_INTERNAL_ENABLE_CXX2A))
ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
KOKKOS_DEVICELIST += SYCL
ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
$(error SYCL backend requires C++17 or newer)
endif
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
KOKKOS_DEVICELIST += OPENMPTARGET
ifneq ($(KOKKOS_INTERNAL_HAVE_CXX17_OR_NEWER), 1)
$(error OpenMPTarget backend requires C++17 or newer)
endif
endif
# CUDA only: locate nvcc on PATH, derive CUDA_PATH from it when CUDA_PATH is
# undefined or empty, and record the nvcc release number.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
ifeq ($(origin CUDA_PATH), undefined)
# Strip the trailing /bin/nvcc to obtain the toolkit root.
CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
endif
ifeq ($(CUDA_PATH),)
CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
endif
# Release number from the "release" line of `nvcc --version`, with dots removed.
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
endif
# Check OS.
KOKKOS_OS := $(strip $(shell uname -s))
KOKKOS_INTERNAL_OS_CYGWIN := $(call kokkos_has_string,$(KOKKOS_OS),CYGWIN)
KOKKOS_INTERNAL_OS_LINUX := $(call kokkos_has_string,$(KOKKOS_OS),Linux)
KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin)
# Check compiler.
# Most vendors are recognised by a keyword in the `$(CXX) --version` output.
# XL, Cray and Cray-clang are detected with `grep -c`, so those variables hold
# a match count rather than a strict 0/1.
KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1))
KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation)
KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI)
KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep -c XL))
KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-"))
# nvcc: 1 if the version output (with OMPI_CXX/MPICH_CXX exported for MPI
# compiler wrappers) mentions nvcc at least once, else 0.
KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc))
KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang)
KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++"))
KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI)
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang)
KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC)
# TODO fujitsu can emulate gcc or clang. Only clang mode works at the moment.
KOKKOS_INTERNAL_COMPILER_FUJITSU := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),FUJITSU)
# Check Host Compiler if using NVCC through nvcc_wrapper
# In that case PGI/Intel/clang detection is redone against the host
# compiler's version string reported by --host-version.
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper))
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1)
KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1))
KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),PGI)
KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),Intel Corporation)
KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),clang)
endif
endif
# Normalise a value of 2 to 1.
# NOTE(review): the CLANG assignments visible above only yield 0 or 1, so the
# CLANG case looks unreachable from here — confirm.
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 2)
KOKKOS_INTERNAL_COMPILER_XL = 1
endif
# Some compilers also match the generic clang test. For those, clear the
# clang flag so that only the vendor-specific branch is taken later.
# Apple Clang passes both clang and apple clang tests, so turn off clang.
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
  KOKKOS_INTERNAL_COMPILER_CLANG = 0
endif
# AMD HCC passes both clang and hcc test so turn off clang
# (the variable name must be spelled exactly as in the clang detection
# for this override to take effect).
ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
  KOKKOS_INTERNAL_COMPILER_CLANG = 0
endif
# Fujitsu passes also as clang and gcc respectively
# Only a warning is issued for gcc (Trad) mode; no detection flag is changed
# here because the overrides below are commented out.
ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
# TODO handle gcc flags and workaround for bug?
# fujitsu (gcc mode) is bugged, see https://github.com/kokkos/kokkos/issues/4730
$(warning Warning: ${CXX} in Trad Mode '-Nnoclang' (default) is not recommended. Use 'CXX = ${CXX} -Nclang' instead.)
# HACK since fujitsu only accepts some gcc flags, disable gcc here?
# KOKKOS_INTERNAL_COMPILER_GCC = 0
endif
# TODO handle clang flags
# warnings: works fine as is
# openmp: handled
#KOKKOS_INTERNAL_COMPILER_CLANG = 0
endif
# For clang, extract the version as a dot-free number (third field of the
# "version" line) and, when CUDA is enabled, require at least 400 (4.0.0).
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
# TODO empty variable if fujitsu (clang mode) passes as clang
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell $(CXX) --version | grep version | cut -d ' ' -f3 | tr -d '.')
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
endif
# Clang compiling CUDA always has lambda support switched on.
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := 1
endif
endif
# Set compiler warnings flags.
# The flag set stays empty unless compiler_warnings was requested; otherwise
# the first matching compiler in the chain below decides the flags.
ifneq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
  KOKKOS_INTERNAL_COMPILER_WARNINGS =
else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
  # TODO check if PGI accepts GNU style warnings
  KOKKOS_INTERNAL_COMPILER_WARNINGS =
else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
  KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
  KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
  KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
  # TODO check if cray accepts GNU style warnings
  KOKKOS_INTERNAL_COMPILER_WARNINGS =
else
  # Everything else is treated as gcc.
  KOKKOS_INTERNAL_COMPILER_WARNINGS = -Wall -Wunused-parameter -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
endif
# Set OpenMP flags.
# The first matching compiler decides, in the order PGI, clang (with Cray and
# Fujitsu special cases), Apple clang, XL, Cray, Intel oneAPI; any other
# compiler gets -fopenmp.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_FUJITSU), 1)
# fujitsu (clang mode) fails with `=libomp`
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
endif
endif
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp=libomp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
# OpenMP is turned on by default in Cray compiler environment.
KOKKOS_INTERNAL_OPENMP_FLAG :=
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
KOKKOS_INTERNAL_OPENMP_FLAG := -fiopenmp
else
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
endif
endif
endif
endif
endif
endif
# OpenMPTarget (offload) flags, chosen in the order XL, clang, Intel oneAPI;
# anything else is assumed to be GCC. Only the clang branch also defines
# KOKKOS_INTERNAL_OPENMPTARGET_LIB.
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_IBM_XL_OMP45_WORKAROUND -qsmp=omp -qoffload -qnoeh
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
#KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_BUG_WORKAROUND_IBM_CLANG_OMP45_VIEW_INIT -fopenmp-implicit-declare-target -fopenmp-targets=nvptx64-nvidia-cuda -fopenmp -fopenmp=libomp
KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -DKOKKOS_WORKAROUND_OPENMPTARGET_CLANG -fopenmp -fopenmp=libomp -Wno-openmp-mapping
KOKKOS_INTERNAL_OPENMPTARGET_LIB := -lomptarget
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL_CLANG), 1)
KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fiopenmp -Wno-openmp-mapping
else
#Assume GCC
KOKKOS_INTERNAL_OPENMPTARGET_FLAG := -fopenmp -foffload=nvptx-none
endif
endif
endif
# Set C++ version flags.
# Each compiler family spells the language-standard switch differently, and
# only the standards defined in a branch are available for that compiler.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
  KOKKOS_INTERNAL_CXX14_FLAG := --c++14
  KOKKOS_INTERNAL_CXX17_FLAG := --c++17
else ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
  KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
  KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
  # Newer standards are left undefined for XL:
  #KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
  #KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
  #KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
  KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14
  # Everything other than C++14 is left undefined for Cray:
  #KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y
  #KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17
  #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z
  #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a
else
  # All remaining compilers use the -std= spelling.
  KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
  KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
  KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
  KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z
  KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
endif
# Check for Kokkos Architecture settings.
# Every variable below stores what kokkos_has_string reports for one
# architecture keyword looked up in KOKKOS_ARCH.

# Intel based: three-letter keywords.
KOKKOS_INTERNAL_USE_ARCH_KNC           := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC)
KOKKOS_INTERNAL_USE_ARCH_WSM           := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM)
KOKKOS_INTERNAL_USE_ARCH_SNB           := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB)
KOKKOS_INTERNAL_USE_ARCH_HSW           := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW)
KOKKOS_INTERNAL_USE_ARCH_BDW           := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW)
KOKKOS_INTERNAL_USE_ARCH_SKL           := $(call kokkos_has_string,$(KOKKOS_ARCH),SKL)
KOKKOS_INTERNAL_USE_ARCH_SKX           := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX)
KOKKOS_INTERNAL_USE_ARCH_KNL           := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL)
KOKKOS_INTERNAL_USE_ARCH_ICL           := $(call kokkos_has_string,$(KOKKOS_ARCH),ICL)
KOKKOS_INTERNAL_USE_ARCH_ICX           := $(call kokkos_has_string,$(KOKKOS_ARCH),ICX)
KOKKOS_INTERNAL_USE_ARCH_SPR           := $(call kokkos_has_string,$(KOKKOS_ARCH),SPR)

# Intel based: IntelGen* / IntelDG1 / IntelXeHP / PVC keywords.
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN     := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen)
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9    := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen9)
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11   := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen11)
KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelGen12LP)
KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1     := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelDG1)
KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP    := $(call kokkos_has_string,$(KOKKOS_ARCH),IntelXeHP)
KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC     := $(call kokkos_has_string,$(KOKKOS_ARCH),PVC)
# NVIDIA based.
NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper

# One kokkos_has_string result per versioned NVIDIA keyword.
KOKKOS_INTERNAL_USE_ARCH_KEPLER30  := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30)
KOKKOS_INTERNAL_USE_ARCH_KEPLER32  := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler32)
KOKKOS_INTERNAL_USE_ARCH_KEPLER35  := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler35)
KOKKOS_INTERNAL_USE_ARCH_KEPLER37  := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler37)
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell50)
KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell52)
KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell53)
KOKKOS_INTERNAL_USE_ARCH_PASCAL61  := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal61)
KOKKOS_INTERNAL_USE_ARCH_PASCAL60  := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60)
KOKKOS_INTERNAL_USE_ARCH_VOLTA70   := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta70)
KOKKOS_INTERNAL_USE_ARCH_VOLTA72   := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72)
KOKKOS_INTERNAL_USE_ARCH_TURING75  := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75)
KOKKOS_INTERNAL_USE_ARCH_AMPERE80  := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80)
KOKKOS_INTERNAL_USE_ARCH_AMPERE86  := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86)

# Sum of all the results above, computed with expr.
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr \
    $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
  + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
  + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
  + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
  + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \
  + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \
  + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86))

#SEK: This seems like a bug to me
# No versioned keyword matched: retry with the bare generation names, mapping
# "Maxwell" onto MAXWELL50 and "Kepler" onto KEPLER35, and recompute the sum.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
  KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell)
  KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler)
  KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr \
      $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50))
endif

# Exactly one NVIDIA architecture with Clang: derive CUDA_PATH from the nvcc
# found on PATH (unless CUDA_PATH is already set) and, for OpenMPTarget, hand
# it to the compiler through --cuda-path.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
    CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
    ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
      KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH)
    endif
  endif
endif
# ARM based.
KOKKOS_INTERNAL_USE_ARCH_ARMV80          := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv80)
KOKKOS_INTERNAL_USE_ARCH_ARMV81          := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81)
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX  := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX)
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-TX2)
KOKKOS_INTERNAL_USE_ARCH_A64FX           := $(call kokkos_has_string,$(KOKKOS_ARCH),A64FX)
# Sum of the ARM results above, evaluated by bc.
KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo \
  $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2)+$(KOKKOS_INTERNAL_USE_ARCH_A64FX) \
  | bc))
# IBM based.
KOKKOS_INTERNAL_USE_ARCH_BGQ    := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ)
KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7)
KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8)
KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9)
# Sum of the IBM results above, evaluated by bc.
KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo \
  $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) \
  | bc))
# AMD based.
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX)
KOKKOS_INTERNAL_USE_ARCH_ZEN3 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen3)
KOKKOS_INTERNAL_USE_ARCH_ZEN2 := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen2)
# The bare "Zen" keyword is only looked up when neither Zen2 nor Zen3 matched
# (presumably because kokkos_has_string would also find "Zen" inside those
# longer keywords -- confirm against its definition).
# Start from an explicit 0 so the variable is always defined: it is used as an
# operand of the expr sum that computes KOKKOS_INTERNAL_USE_ARCH_AVX2, and an
# empty operand there is not valid expr syntax.
KOKKOS_INTERNAL_USE_ARCH_ZEN := 0
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 0)
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 0)
    KOKKOS_INTERNAL_USE_ARCH_ZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Zen)
  endif
endif
# AMD GPU (Vega) keywords.
KOKKOS_INTERNAL_USE_ARCH_VEGA900 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega900)
KOKKOS_INTERNAL_USE_ARCH_VEGA906 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega906)
KOKKOS_INTERNAL_USE_ARCH_VEGA908 := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega908)
KOKKOS_INTERNAL_USE_ARCH_VEGA90A := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega90A)
# Any AVX?
# Fold the individual CPU keywords into instruction-set classes.
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM))
KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX))
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN2) + $(KOKKOS_INTERNAL_USE_ARCH_ZEN3))
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL))
# Incompatible flags?
# Each variable becomes 1 when more than one architecture of that kind was
# requested.  The "| bc" must be inside $(shell ...): placed outside, bc never
# runs, the variable holds the literal text "<sum>>1 | bc", and the checks
# below can never fire.
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_SKL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX)+$(KOKKOS_INTERNAL_USE_ARCH_ICL)+$(KOKKOS_INTERNAL_USE_ARCH_ICX)+$(KOKKOS_INTERNAL_USE_ARCH_SPR)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc))
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
# Abort configuration when conflicting architectures were requested.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
  $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
endif
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
  $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
endif
# Generating the list of Flags.
# Include and library search paths: populated for a plain Makefile build,
# left empty when KOKKOS_CMAKE is "yes".
ifneq ($(KOKKOS_CMAKE), yes)
  KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
  KOKKOS_LIBDIRS = -L$(shell pwd)
  # CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
  KOKKOS_CXXLDFLAGS = -L$(shell pwd)
else
  KOKKOS_CPPFLAGS =
  KOKKOS_LIBDIRS =
endif

# Third-party-library bookkeeping; libdl is always linked.
KOKKOS_TPL_INCLUDE_DIRS =
KOKKOS_TPL_LIBRARY_DIRS =
KOKKOS_TPL_LIBRARY_NAMES =
KOKKOS_TPL_LIBRARY_NAMES += dl
KOKKOS_LIBS = -ldl

# Optional compiler warning flags.
ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
endif

# Lists that later sections append to.
KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =
# Generating the KokkosCore_config.h file.
# The header text is accumulated line by line in the temporary file named by
# KOKKOS_INTERNAL_CONFIG_TMP.  KOKKOS_CONFIG_HEADER is the header name quoted
# in the generated "do not include directly" error text.
KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp
KOKKOS_CONFIG_HEADER=KokkosCore_config.h
# Functions for generating config header file
# kokkos_append_header: runs a shell echo of its first argument and appends
# the output (>>) to the temporary config file.  The argument is pasted into
# the shell command as-is, so callers pass it already quoted for the shell.
kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP))
# assign hash sign to variable for compat. with make 4.3
# Callers write $H next to a quoted string to get a literal hash character in
# front of a preprocessor directive.
H := \#
# Do not append first line
# The first line is written with > rather than >>, so the temporary file is
# created or truncated here before any line is appended.  The results of the
# shell calls are assigned to the throwaway variable tmp.
tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
tmp := $(call kokkos_append_header,"Makefile constructed configuration:")
tmp := $(call kokkos_append_header,"----------------------------------------------*/")
# Include guard: reject direct inclusion of the config header and define
# KOKKOS_CORE_CONFIG_H otherwise.
tmp := $(call kokkos_append_header,'$H''if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)')
tmp := $(call kokkos_append_header,'$H''error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."')
tmp := $(call kokkos_append_header,'$H''else')
tmp := $(call kokkos_append_header,'$H''define KOKKOS_CORE_CONFIG_H')
tmp := $(call kokkos_append_header,'$H''endif')
tmp := $(call kokkos_append_header,"")
# Version macro, taken from the make variable KOKKOS_VERSION.
tmp := $(call kokkos_append_header,"$H""define KOKKOS_VERSION $(KOKKOS_VERSION)")
tmp := $(call kokkos_append_header,"")
tmp := $(call kokkos_append_header,"/* Execution Spaces */")
# Emit one KOKKOS_ENABLE_<backend> define per enabled backend.

# CUDA additionally records the NVCC version.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
endif

ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_HIP')
endif

ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_SYCL')
endif

# OpenMPTarget links libatomic and, with GCC, enables a workaround macro.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_LIBS += -latomic
  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMPTARGET')
  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_WORKAROUND_OPENMPTARGET_GCC")
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  tmp := $(call kokkos_append_header,'$H''define KOKKOS_ENABLE_OPENMP')
endif

ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_THREADS")
endif

ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX")
endif

ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SERIAL")
endif
# General settings section of the generated header.
tmp := $(call kokkos_append_header,"/* General Settings */")

# Deprecated code stays enabled unless it was explicitly disabled.
ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATED_CODE_3")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEPRECATION_WARNINGS")
endif

# C++ standard selection.  For c++14 and c++17 the compiler flag is only added
# when this is not a standalone CMake build; the draft-standard spellings
# (1y, 1z, 2a) and c++20 always add their flag here.
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
  ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
  endif
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
  ifneq ($(KOKKOS_STANDALONE_CMAKE), yes)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
  endif
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX20), 1)
  #I cannot make CMake add this in a good way - so add it here
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX20_FLAG)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CXX20")
endif
# Debug build: add -g (plus -lineinfo under NVCC) and the debug macros.
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_CXXFLAGS += -lineinfo
  endif
  KOKKOS_CXXFLAGS += -g
  KOKKOS_LDFLAGS += -g
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG")
  # The DualView modify check is part of debug mode unless it was disabled.
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
  endif
endif

# Independent feature toggles.
ifeq ($(KOKKOS_INTERNAL_DISABLE_COMPLEX_ALIGN), 0)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_COMPLEX_ALIGN")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_PROFILING_LOAD_PRINT")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING")
endif

# libdl support is always advertised.
tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL")
# hwloc: search paths and library are only added outside of CMake builds;
# the define is always emitted.
ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
  ifneq ($(KOKKOS_CMAKE), yes)
    ifneq ($(HWLOC_PATH),)
      KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
      KOKKOS_LIBDIRS += -L$(HWLOC_PATH)/lib
      KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib
      KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
      KOKKOS_TPL_LIBRARY_DIRS += $(HWLOC_PATH)/lib
    endif
    KOKKOS_LIBS += -lhwloc
    KOKKOS_TPL_LIBRARY_NAMES += hwloc
  endif
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HWLOC")
endif

# librt.
ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_USE_LIBRT")
  KOKKOS_LIBS += -lrt
  KOKKOS_TPL_LIBRARY_NAMES += rt
endif

# memkind (with numa): same layout as hwloc; the define emitted is
# KOKKOS_ENABLE_HBWSPACE.
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  ifneq ($(KOKKOS_CMAKE), yes)
    ifneq ($(MEMKIND_PATH),)
      KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
      KOKKOS_LIBDIRS += -L$(MEMKIND_PATH)/lib
      KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
      KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
      KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib
    endif
    KOKKOS_LIBS += -lmemkind -lnuma
    KOKKOS_TPL_LIBRARY_NAMES += memkind numa
  endif
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HBWSPACE")
endif

ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LARGE_MEM_TESTS")
endif
tmp := $(call kokkos_append_header,"/* Optimization Settings */")
# Aggressive vectorization emits two macros; the first is the deprecated
# spelling.
ifeq ($(KOKKOS_INTERNAL_AGGRESSIVE_VECTORIZATION), 1)
  # deprecated
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION")
endif
tmp := $(call kokkos_append_header,"/* Cuda Settings */")
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)

  # LDG intrinsic: enabled on request, and always enabled under Clang.
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_UVM")
  endif

  # Relocatable device code: the flag spelling depends on Clang vs. anything else.
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      KOKKOS_CXXFLAGS += -fcuda-rdc
      KOKKOS_LDFLAGS += -fcuda-rdc
    else
      KOKKOS_CXXFLAGS += --relocatable-device-code=true
      KOKKOS_LDFLAGS += --relocatable-device-code=true
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 90; echo $$?),0)
      # This diagnostic is just plain wrong in CUDA 9
      # See https://github.com/kokkos/kokkos/issues/1470
      KOKKOS_CXXFLAGS += -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored
    endif
  endif

  # Extended lambdas: NVCC needs a version above 70 and an extra flag;
  # Clang only needs the define.
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
        tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
        KOKKOS_CXXFLAGS += -expt-extended-lambda
      else
        $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
      endif
    endif
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_LAMBDA")
    endif
  endif

  # Relaxed constexpr: NVCC needs version 80 or newer and an extra flag;
  # Clang only needs the define.
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 80; echo $$?),0)
        tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
        KOKKOS_CXXFLAGS += -expt-relaxed-constexpr
      else
        $(warning Warning: Cuda relaxed constexpr support was requested but NVCC version is too low. This requires NVCC for Cuda version 8.0 or higher. Disabling relaxed constexpr support now.)
      endif
    endif
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_CUDA_CONSTEXPR")
    endif
  endif

  # Clang-as-CUDA-compiler workaround macro.
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
  endif

endif
# HPX asynchronous dispatch is only advertised when the HPX backend is on.
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  ifeq ($(KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HPX_ASYNC_DISPATCH")
  endif
endif
# Add Architecture flags.
# For each selected architecture: emit its KOKKOS_ARCH_* define(s) and append
# the matching compiler/linker options.

# ARMv8.0: Cray and PGI get no extra options.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else
    KOKKOS_CXXFLAGS += -march=armv8-a
    KOKKOS_LDFLAGS += -march=armv8-a
  endif
endif

# ARMv8.1: same scheme as ARMv8.0.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else
    KOKKOS_CXXFLAGS += -march=armv8.1-a
    KOKKOS_LDFLAGS += -march=armv8.1-a
  endif
endif

# A64FX: SVE is always requested; Clang and GCC also pin the vector length
# to 512 bits.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_A64FX")
  KOKKOS_CXXFLAGS += -march=armv8.2-a+sve
  KOKKOS_LDFLAGS += -march=armv8.2-a+sve
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_CXXFLAGS += -msve-vector-bits=512
    KOKKOS_LDFLAGS += -msve-vector-bits=512
  endif
  ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1)
    KOKKOS_CXXFLAGS += -msve-vector-bits=512
    KOKKOS_LDFLAGS += -msve-vector-bits=512
  endif
endif
# AMD Zen family.  Every generation also defines KOKKOS_ARCH_AMD_AVX2; the
# Intel compiler gets -mavx2, every other compiler the matching znverN options.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx2
    KOKKOS_LDFLAGS += -mavx2
  else
    KOKKOS_CXXFLAGS += -march=znver1 -mtune=znver1
    KOKKOS_LDFLAGS += -march=znver1 -mtune=znver1
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN2), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN2")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx2
    KOKKOS_LDFLAGS += -mavx2
  else
    KOKKOS_CXXFLAGS += -march=znver2 -mtune=znver2
    KOKKOS_LDFLAGS += -march=znver2 -mtune=znver2
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN3), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_ZEN3")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_AVX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx2
    KOKKOS_LDFLAGS += -mavx2
  else
    KOKKOS_CXXFLAGS += -march=znver3 -mtune=znver3
    KOKKOS_LDFLAGS += -march=znver3 -mtune=znver3
  endif
endif
# ThunderX: also defines the generic ARMv8.0 macro.  Cray and PGI get no
# extra options.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV80")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else
    KOKKOS_CXXFLAGS += -march=armv8-a -mtune=thunderx
    KOKKOS_LDFLAGS += -march=armv8-a -mtune=thunderx
  endif
endif

# ThunderX2: also defines the generic ARMv8.1 macro.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX2), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV81")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ARMV8_THUNDERX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS +=
    KOKKOS_LDFLAGS +=
  else
    KOKKOS_CXXFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99
    KOKKOS_LDFLAGS += -mtune=thunderx2t99 -mcpu=thunderx2t99
  endif
endif
# SSE4.2 hosts: options per compiler; Cray gets none.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_SSE42")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xSSE4.2
    KOKKOS_LDFLAGS += -xSSE4.2
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -tp=nehalem
    KOKKOS_LDFLAGS += -tp=nehalem
  else
    # Assume that this really is a GNU compiler.
    KOKKOS_CXXFLAGS += -msse4.2
    KOKKOS_LDFLAGS += -msse4.2
  endif
endif

# AVX hosts: options per compiler; Cray gets none.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -mavx
    KOKKOS_LDFLAGS += -mavx
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -tp=sandybridge
    KOKKOS_LDFLAGS += -tp=sandybridge
  else
    # Assume that this really is a GNU compiler.
    KOKKOS_CXXFLAGS += -mavx
    KOKKOS_LDFLAGS += -mavx
  endif
endif
# IBM POWER hosts.  The define is always emitted; the -mcpu/-mtune options
# are added for every compiler except PGI.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER7")
  ifneq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    # Assume that this really is a GNU compiler, or it could be XL on P8.
    KOKKOS_CXXFLAGS += -mcpu=power7 -mtune=power7
    KOKKOS_LDFLAGS += -mcpu=power7 -mtune=power7
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER8")
  ifneq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
    KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
  endif
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_POWER9")
  ifneq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
    KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
  endif
endif
# Haswell: defines KOKKOS_ARCH_AVX2; Cray gets no options.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX2
    KOKKOS_LDFLAGS += -xCORE-AVX2
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -tp=haswell
    KOKKOS_LDFLAGS += -tp=haswell
  else
    # Assume that this really is a GNU compiler.
    KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
    KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
  endif
endif

# Broadwell: same as Haswell, except that the GNU-style branch also adds -mrtm.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX2")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX2
    KOKKOS_LDFLAGS += -xCORE-AVX2
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    KOKKOS_CXXFLAGS += -tp=haswell
    KOKKOS_LDFLAGS += -tp=haswell
  else
    # Assume that this really is a GNU compiler.
    KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2 -mrtm
    KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2 -mrtm
  endif
endif
# Knights Landing (AVX512MIC): Cray and PGI get no options.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512MIC")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xMIC-AVX512
    KOKKOS_LDFLAGS += -xMIC-AVX512
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    # PGI: no architecture options are added.
  else
    # Assume that this really is a GNU compiler.
    KOKKOS_CXXFLAGS += -march=knl -mtune=knl
    KOKKOS_LDFLAGS += -march=knl -mtune=knl
  endif
endif

# Skylake (SKL): defines KOKKOS_ARCH_AVX512XEON.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKL), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xSKYLAKE
    KOKKOS_LDFLAGS += -xSKYLAKE
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    # PGI: no architecture options are added.
  else
    # Any other compiler gets the GNU-style option.
    KOKKOS_CXXFLAGS += -march=skylake
    KOKKOS_LDFLAGS += -march=skylake
  endif
endif

# Skylake-X (SKX): defines KOKKOS_ARCH_AVX512XEON.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SKX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
  ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
    KOKKOS_CXXFLAGS += -xCORE-AVX512
    KOKKOS_LDFLAGS += -xCORE-AVX512
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
    # Cray: no architecture options are added.
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
    # PGI: no architecture options are added.
  else
    # Any other compiler gets the GNU-style options.
    KOKKOS_CXXFLAGS += -march=skylake-avx512 -mtune=skylake-avx512
    KOKKOS_LDFLAGS += -march=skylake-avx512 -mtune=skylake-avx512
  endif
endif
# Ice Lake client/server and Sapphire Rapids: all define
# KOKKOS_ARCH_AVX512XEON and add -march/-mtune regardless of the compiler.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICL), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
  KOKKOS_CXXFLAGS += -march=icelake-client -mtune=icelake-client
  KOKKOS_LDFLAGS += -march=icelake-client -mtune=icelake-client
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ICX), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
  KOKKOS_CXXFLAGS += -march=icelake-server -mtune=icelake-server
  KOKKOS_LDFLAGS += -march=icelake-server -mtune=icelake-server
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SPR), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AVX512XEON")
  KOKKOS_CXXFLAGS += -march=sapphirerapids -mtune=sapphirerapids
  KOKKOS_LDFLAGS += -march=sapphirerapids -mtune=sapphirerapids
endif

# Knights Corner.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KNC")
  KOKKOS_CXXFLAGS += -mmic
  KOKKOS_LDFLAGS += -mmic
endif
# Figure out the architecture flag for Cuda.
# KOKKOS_INTERNAL_USE_CUDA_ARCH is set to 1 for the CUDA backend, for HIP
# compiled with NVCC, and for OpenMPTarget.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
endif
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
  endif
endif

# Spelling of the architecture option: NVCC uses -arch, Clang uses
# --cuda-gpu-arch (and is switched to CUDA mode with -x cuda).  Any other
# compiler is a configuration error.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = -arch
  else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = --cuda-gpu-arch
    KOKKOS_CXXFLAGS += -x cuda
  else
    $(error Makefile.kokkos: CUDA is enabled but the compiler is neither NVCC nor Clang (got version string $(KOKKOS_CXX_VERSION)) )
  endif
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
endif

# OpenMPTarget: with Clang the architecture is passed through the nvptx64
# offload target options, replacing the spelling chosen above.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG = -fopenmp-targets=nvptx64 -Xopenmp-target -march
  endif
  KOKKOS_INTERNAL_USE_CUDA_ARCH = 1
endif
# Map the selected NVIDIA architecture onto its sm_XX value.  Each stanza
# emits the generation macro plus the exact-architecture macro and completes
# KOKKOS_INTERNAL_CUDA_ARCH_FLAG with "=sm_XX".
ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1)

  # Kepler
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37
  endif

  # Maxwell
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53
  endif

  # Pascal
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61
  endif

  # Volta
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72
  endif

  # Turing
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75
  endif

  # Ampere
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86")
    KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86
  endif

  # Publish the completed option: always to the compile flags, and to the
  # link flags for NVCC or for Clang with OpenMPTarget.
  ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
    endif
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
        KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)
      endif
    endif
  endif

  # HIP reaching this section additionally gets the extended-lambda option.
  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
    KOKKOS_CXXFLAGS += --expt-extended-lambda
  endif

endif
# Figure out the architecture flag for ROCm.
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)

  # Lets start with adding architecture defines
  # Each Vega variant emits its own macro plus KOKKOS_ARCH_VEGA and selects
  # the matching --offload-arch value.
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA900), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA900")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx900
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA906), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA906")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA908), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA908")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA90A), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA90A")
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VEGA")
    KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a
  endif

  # HIP backend sources and headers; the desul HIP lock array is added
  # unless desul atomics are disabled.
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp)
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
  endif

  # The architecture option goes to both compile and link flags.
  KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)
  KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG)

  # Relocatable device code is switched explicitly on or off.
  ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1)
    tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE")
    KOKKOS_CXXFLAGS+=-fgpu-rdc
    KOKKOS_LDFLAGS+=-fgpu-rdc
  else
    KOKKOS_CXXFLAGS+=-fno-gpu-rdc
    KOKKOS_LDFLAGS+=-fno-gpu-rdc
  endif

endif
# Figure out Intel architecture flags.
#
# KOKKOS_INTERNAL_LC_BACKEND is the lower-case backend name spliced into the
# -f<backend>-targets / -X<backend>-target-backend options. OpenMPTarget is
# tested last, so it wins when both backends are enabled.
ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
  KOKKOS_INTERNAL_LC_BACKEND := sycl
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_INTERNAL_LC_BACKEND := openmp
endif

# Generic Intel GPU: JIT target (spir64), no device-specific backend option.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64
endif

# Specific devices: spir64_gen target plus a "-device <name>" backend option.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN9), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN9")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device gen9"
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN11), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN11")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device gen11"
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_GEN12LP), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GEN12LP")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device gen12lp"
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_DG1), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_DG1")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device dg1"
endif

ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_XEHP), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_XEHP")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device xehp"
endif

# PVC is selected by device version string rather than by name.
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_INTEL_PVC), 1)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_GPU")
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_INTEL_PVC")
  KOKKOS_INTERNAL_INTEL_ARCH_FLAG := -f$(KOKKOS_INTERNAL_LC_BACKEND)-targets=spir64_gen -X$(KOKKOS_INTERNAL_LC_BACKEND)-target-backend "-device 12.4.0"
endif
# SYCL backend: sources, headers, and compile/link flags. The Intel
# architecture flag is appended after the generic -fsycl options.
ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
  KOKKOS_SRC      += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.cpp)
  KOKKOS_HEADERS  += $(wildcard $(KOKKOS_PATH)/core/src/SYCL/*.hpp)
  KOKKOS_CXXFLAGS += -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-unnamed-lambda -fsycl-dead-args-optimization $(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
  KOKKOS_LDFLAGS  += -fsycl $(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
endif

# OpenMPTarget: the Intel architecture flag goes to both compile and link
# lines; __STRICT_ANSI__ is additionally defined when compiling.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) -D__STRICT_ANSI__
  KOKKOS_LDFLAGS  += $(KOKKOS_INTERNAL_INTEL_ARCH_FLAG)
endif
# Desul atomics: unless disabled, define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS and
# add the bundled desul include directory. Requesting "enable" and "disable"
# at the same time is a configuration error.
ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
  tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_DESUL_ATOMICS")
  KOKKOS_CPPFLAGS += -I$(KOKKOS_PATH)/tpls/desul/include
else ifeq ($(KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS), 1)
  $(error Contradictory Desul atomics options: KOKKOS_OPTIONS=$(KOKKOS_OPTIONS) )
endif
# Decide whether KokkosCore_config.h has to be (re)written.
# KOKKOS_INTERNAL_NEW_CONFIG ends up as:
#   1  when no KokkosCore_config.h exists in the current directory, or
#   N  the number of "define" lines reported by diff between the existing
#      header and the freshly generated KokkosCore_config.tmp (0 = unchanged).
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)

ifneq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
  KOKKOS_INTERNAL_NEW_CONFIG := 1
else
  KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define))
endif
# Runs only when the configuration changed (KOKKOS_INTERNAL_NEW_CONFIG != 0):
# install the new KokkosCore_config.h and regenerate the per-backend
# Fwd/Setup/Declare/PostInclude include headers in the current directory.
ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
  tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)

  # Functions for generating config header file
  #   kokkos_start_config_header  <out>            copy the HeaderSet template to <out>,
  #                                                removing the @INCLUDE_NEXT_FILE@ marker
  #   kokkos_update_config_header <tag>,<in>,<out> replace @HEADER_GUARD_TAG@ in <in> with
  #                                                <tag> and write the result to <out>
  #   kokkos_append_config_header <text>,<file>    append <text> as one line to <file>
  kokkos_start_config_header = $(shell sed 's~@INCLUDE_NEXT_FILE@~~g' $(KOKKOS_PATH)/cmake/KokkosCore_Config_HeaderSet.in > $1)
  kokkos_update_config_header = $(shell sed 's~@HEADER_GUARD_TAG@~$1~g' $2 > $3)
  # A stray second ")" used to follow the $(shell ...) call here; it was not
  # part of the command and leaked a literal ")" into every call result.
  kokkos_append_config_header = $(shell echo $1 >> $2)

  # Instantiate the four templates, then stamp each with its header guard.
  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_FwdBackend.tmp")
  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_SetupBackend.tmp")
  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_DeclareBackend.tmp")
  tmp := $(call kokkos_start_config_header, "KokkosCore_Config_PostInclude.tmp")
  tmp := $(call kokkos_update_config_header, KOKKOS_FWD_HPP_, "KokkosCore_Config_FwdBackend.tmp", "KokkosCore_Config_FwdBackend.hpp")
  tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config_SetupBackend.tmp", "KokkosCore_Config_SetupBackend.hpp")
  tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp")
  tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp")

  # Per-backend include lines.
  # NOTE(review): every payload below is just "#include " with no header
  # name; it looks like the <...> header paths were lost from this copy of the
  # file. Confirm against the upstream Makefile.kokkos before relying on the
  # generated headers.
  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
  ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp")
    tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp")
  endif
endif
# Backend-independent headers and sources of core, containers and algorithms.
KOKKOS_HEADERS += \
  $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) \
  $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) \
  $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) \
  $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) \
  $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)

KOKKOS_SRC += \
  $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp) \
  $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
# CUDA backend: sources, headers, toolkit include/library paths and the CUDA
# runtime/driver libraries.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
  # The desul lock array is only compiled when desul atomics are not disabled.
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DESUL_ATOMICS), 0)
    KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
  endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)

  # Toolkit paths are only added when the user provides CUDA_PATH.
  ifneq ($(CUDA_PATH),)
    # This used to append to the misspelled variable KOKKOS_CPPLAGS, which
    # nothing reads, so the CUDA include directory never reached the
    # preprocessor flags.
    KOKKOS_CPPFLAGS += -I$(CUDA_PATH)/include

    # Prefer lib64, fall back to lib, and fail loudly if neither exists.
    ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib64), 1)
      KOKKOS_LIBDIRS += -L$(CUDA_PATH)/lib64
      KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
      KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
    else ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1)
      KOKKOS_LIBDIRS += -L$(CUDA_PATH)/lib
      KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib
      KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib
    else
      $(error Can't find CUDA library directory: no lib64 or lib directory in $(CUDA_PATH))
    endif
    KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include

    # Clang takes the toolkit location through --cuda-path.
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
      KOKKOS_CXXFLAGS += --cuda-path=$(CUDA_PATH)
    endif
  endif

  KOKKOS_LIBS += -lcudart -lcuda
  KOKKOS_TPL_LIBRARY_NAMES += cudart cuda
endif
# OpenMPTarget backend: sources, headers, and the OpenMP target flag/library.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
  KOKKOS_SRC     += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMPTarget/*.hpp)

  # With CUDA also enabled the flag is passed through -Xcompiler; otherwise
  # it goes on the compile line directly.
  ifneq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  else
    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  endif

  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG)
  KOKKOS_LIBS    += $(KOKKOS_INTERNAL_OPENMPTARGET_LIB)
endif
# OpenMP backend: sources, headers, and the OpenMP flag on compile and link.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  KOKKOS_SRC     += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)

  # Under nvcc the flag is passed through -Xcompiler; other compilers take it
  # directly.
  ifneq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
    KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
  else
    KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
  endif

  KOKKOS_LDFLAGS    += $(KOKKOS_INTERNAL_OPENMP_FLAG)
  KOKKOS_LINK_FLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
endif
# Threads backend: sources, headers, and the pthread library.
ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
  KOKKOS_SRC               += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
  KOKKOS_HEADERS           += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
  KOKKOS_LIBS              += -lpthread
  KOKKOS_TPL_LIBRARY_NAMES += pthread
endif
# Serial backend: sources and headers only, no extra flags or libraries.
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  KOKKOS_SRC     += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Serial/*.hpp)
endif
# HPX backend: sources, headers, and compile/link flags queried from
# pkg-config.
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
  KOKKOS_SRC     += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HPX/*.hpp)

  # pkg-config command: when HPX_PATH is given, search its lib64/pkgconfig
  # directory; otherwise use pkg-config's default search path.
  ifneq ($(HPX_PATH),)
    KOKKOS_INTERNAL_HPX_PKG_CONFIG = PKG_CONFIG_PATH=$(HPX_PATH)/lib64/pkgconfig pkg-config
  else
    KOKKOS_INTERNAL_HPX_PKG_CONFIG = pkg-config
  endif

  # pkg-config module: the debug flavour is used for debug builds.
  ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
    KOKKOS_INTERNAL_HPX_PKG = hpx_application_debug
  else
    KOKKOS_INTERNAL_HPX_PKG = hpx_application
  endif

  KOKKOS_CXXFLAGS   += $(shell $(KOKKOS_INTERNAL_HPX_PKG_CONFIG) --cflags $(KOKKOS_INTERNAL_HPX_PKG))
  KOKKOS_CXXLDFLAGS += $(shell $(KOKKOS_INTERNAL_HPX_PKG_CONFIG) --libs $(KOKKOS_INTERNAL_HPX_PKG))
  KOKKOS_LIBS       += $(shell $(KOKKOS_INTERNAL_HPX_PKG_CONFIG) --libs $(KOKKOS_INTERNAL_HPX_PKG))

  KOKKOS_TPL_LIBRARY_NAMES += hpx
endif
# Kokkos_HBWSpace.cpp is only wanted with MEMKIND; without it, drop the file
# from the source list to avoid a link warning.
ifneq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp,$(KOKKOS_SRC))
endif

# On Cygwin, functions such as fdopen and fileno are not declared when strict
# ANSI mode is active, and -std=c++14 activates it. So __STRICT_ANSI__ is
# hard-undefined here; it is unclear whether that has bad side effects.
# This is needed for gtest, not for Kokkos itself.
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
  KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
endif

# KOKKOS_EXTRA_LIBS keeps the third-party libraries only; KOKKOS_LIBS gets
# -lkokkos put in front of them for the link line.
KOKKOS_EXTRA_LIBS := $(KOKKOS_LIBS)
KOKKOS_LIBS       := -lkokkos $(KOKKOS_LIBS)
# Setting up dependencies.

# Target with no prerequisites and no recipe, so make does not look for a
# rule to build the generated header.
KokkosCore_config.h:

# Every Kokkos object depends on the generated config header and all headers.
KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS)

# Object names: the sources with .cpp replaced by .o, and the same names with
# the directory part removed.
KOKKOS_OBJ = $(patsubst %.cpp,%.o,$(KOKKOS_SRC))
KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))

include $(KOKKOS_PATH)/Makefile.targets
# Remove everything this makefile generates in the current directory: the
# Kokkos objects, the static library, and the generated config headers with
# their temporary files.
# Declared phony so a stray file named "kokkos-clean" cannot make the target
# look up to date.
.PHONY: kokkos-clean
kokkos-clean:
	rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a KokkosCore_Config_SetupBackend.hpp \
	KokkosCore_Config_FwdBackend.hpp KokkosCore_Config_DeclareBackend.hpp KokkosCore_Config_DeclareBackend.tmp \
	KokkosCore_Config_FwdBackend.tmp KokkosCore_Config_PostInclude.hpp KokkosCore_Config_PostInclude.tmp KokkosCore_Config_SetupBackend.tmp
# Static library: archive the objects, then build the archive index.
libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
	ar cr $@ $(KOKKOS_OBJ_LINK)
	ranlib $@
# Print the accumulated Kokkos compile flags.
# Declared phony so a stray file named "print-cxx-flags" cannot suppress it.
.PHONY: print-cxx-flags
print-cxx-flags:
	echo "$(KOKKOS_CXXFLAGS)"
# What an application link step has to depend on.
KOKKOS_LINK_DEPENDS = libkokkos.a

# LDFLAGS are kept separate from LIBS and LIBDIRS, and CPPFLAGS separate from
# CXXFLAGS. Outside of CMake, for backwards compatibility, everything is
# folded back into CXXFLAGS and LDFLAGS.
ifneq ($(KOKKOS_CMAKE), yes)
  KOKKOS_CXXFLAGS += $(KOKKOS_CPPFLAGS)
  KOKKOS_LDFLAGS  += $(KOKKOS_LIBDIRS)
endif
kokkos-3.7.01/Makefile.targets 0000664 0000000 0000000 00000022324 14343743117 0016225 0 ustar 00root root 0000000 0000000 Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
# Compile rules for the core implementation sources that are not guarded by a
# backend switch. Every rule has the same shape: the object (named without a
# directory) depends on $(KOKKOS_CPP_DEPENDS) and its own .cpp file, and the
# recipe compiles that .cpp with the Kokkos preprocessor and compiler flags
# followed by the user's $(CXXFLAGS).
Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
Kokkos_Stacktrace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Stacktrace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Stacktrace.cpp
Kokkos_ExecPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_ExecPolicy.cpp
Kokkos_Command_Line_Parsing.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Command_Line_Parsing.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Command_Line_Parsing.cpp
Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp
Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp
Kokkos_Profiling.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling.cpp
Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp
Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp
Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp
Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp
# Compile rules for the Serial backend; only defined when
# KOKKOS_INTERNAL_USE_SERIAL is 1.
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp
Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp
endif
# Compile rules for the CUDA backend; only defined when
# KOKKOS_INTERNAL_USE_CUDA is 1. Lock_Array_CUDA.cpp comes from the bundled
# desul library under tpls/ rather than from core/src.
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp
Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp
endif
# Compile rules for the SYCL backend; only defined when
# KOKKOS_INTERNAL_USE_SYCL is 1.
ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1)
Kokkos_SYCL.o : $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL.cpp
Kokkos_SYCL_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Space.cpp
Kokkos_SYCL_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/SYCL/Kokkos_SYCL_Instance.cpp
endif
# Compile rules for the HIP backend; only defined when
# KOKKOS_INTERNAL_USE_HIP is 1. Lock_Array_HIP.cpp comes from the bundled
# desul library under tpls/ rather than from core/src.
ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1)
Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp
Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp
Kokkos_HIP_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Locks.cpp
Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp
endif
# Compile rule for the Threads backend; only defined when
# KOKKOS_INTERNAL_USE_THREADS is 1.
ifeq ($(KOKKOS_INTERNAL_USE_THREADS), 1)
Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
endif
# Compile rules for the OpenMP backend; only defined when
# KOKKOS_INTERNAL_USE_OPENMP is 1.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp
Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
endif
# Compile rules for the HPX backend; only defined when
# KOKKOS_INTERNAL_USE_HPX is 1.
ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1)
Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp
Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp
endif
# Compile rules for the OpenMPTarget backend; only defined when
# KOKKOS_INTERNAL_USE_OPENMPTARGET is 1.
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp
Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp
Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp
Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp
endif
# Compile rule for the HBWSpace source. The rule itself is unconditional.
# NOTE(review): the main makefile appears to filter Kokkos_HBWSpace.cpp out of
# KOKKOS_SRC when MEMKIND is off, so this rule would then be unused - confirm.
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
kokkos-3.7.01/README.md 0000664 0000000 0000000 00000006040 14343743117 0014371 0 ustar 00root root 0000000 0000000 
# Kokkos: Core Libraries
Kokkos Core implements a programming model in C++ for writing performance portable
applications targeting all major HPC platforms. For that purpose it provides
abstractions for both parallel execution of code and data management.
Kokkos is designed to target complex node architectures with N-level memory
hierarchies and multiple types of execution resources. It currently can use
CUDA, HIP, SYCL, HPX, OpenMP and C++ threads as backend programming models with several other
backends in development.
**Kokkos Core is part of the Kokkos C++ Performance Portability Programming EcoSystem.**
For the complete documentation, click below:
# [kokkos.github.io/kokkos-core-wiki](https://kokkos.github.io/kokkos-core-wiki)
# Learning about Kokkos
To start learning about Kokkos:
- [Kokkos Lectures](https://kokkos.github.io/kokkos-core-wiki/videolectures.html): they contain a mix of lecture videos and hands-on exercises covering all the important Kokkos Ecosystem capabilities.
- [Programming guide](https://kokkos.github.io/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch.
- [API reference](https://kokkos.github.io/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.github.io/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.github.io/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.github.io/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.github.io/kokkos-core-wiki/API/alphabetical.html).
- [Use cases and Examples](https://kokkos.github.io/kokkos-core-wiki/usecases.html): a series of examples ranging from how to use Kokkos with MPI to Fortran interoperability.
For questions, find us on Slack (https://kokkosteam.slack.com) or open a GitHub issue.
For non-public questions send an email to: *crtrott(at)sandia.gov*
# Contributing to Kokkos
Please see [this page](https://kokkos.github.io/kokkos-core-wiki/contributing.html) for details on how to contribute.
# Requirements, Building and Installing
All requirements including minimum and primary tested compiler versions can be found [here](https://kokkos.github.io/kokkos-core-wiki/requirements.html).
Building and installation instructions are described [here](https://kokkos.github.io/kokkos-core-wiki/building.html).
# Citing Kokkos
Please see the [following page](https://kokkos.github.io/kokkos-core-wiki/citation.html).
# License
[BSD 3-Clause License](https://opensource.org/licenses/BSD-3-Clause)
Under the terms of Contract DE-NA0003525 with NTESS,
the U.S. Government retains certain rights in this software.
The full license statement used in all headers is available [here](https://kokkos.github.io/kokkos-core-wiki/license.html) or
[here](https://github.com/kokkos/kokkos/blob/master/LICENSE).
kokkos-3.7.01/Spack.md 0000664 0000000 0000000 00000031146 14343743117 0014502 0 ustar 00root root 0000000 0000000 
# Kokkos Spack
This gives instructions for using Spack to install Kokkos and developing packages that depend on Kokkos.
## Getting Started
Make sure you have downloaded [Spack](https://github.com/spack/spack).
The easiest way to configure the Spack environment is:
````bash
> source spack/share/spack/setup-env.sh
````
with other scripts available for other shells.
You can display information about how to install packages with:
````bash
> spack info kokkos
````
This will print all the information about how to install Kokkos with Spack.
For detailed instructions on how to use Spack, see the [User Manual](https://spack.readthedocs.io).
## Setting Up Spack: Avoiding the Package Cascade
By default, Spack doesn't 'see' anything on your system - including things like CMake and CUDA.
As a result, Spack will try to build all of these dependencies itself. This cascade of package builds can be avoided by adding a `packages.yaml` to your `$HOME/.spack` folder that includes CMake (and CUDA, if applicable). For example, your `packages.yaml` file could be:
````yaml
packages:
cuda:
buildable: false
externals:
- prefix: /opt/local/ppc64le-pwr8-nvidia/cuda/10.1.243
spec: cuda@10.1.243
- modules:
- cuda/10.1.243
spec: cuda@10.1.243
cmake:
buildable: false
externals:
- prefix: /opt/local/ppc64le/cmake/3.16.8
spec: cmake@3.16.8
- modules:
- cmake/3.16.8
spec: cmake@3.16.8
````
The `modules` entry is only necessary on systems that require loading Modules (i.e. most DOE systems).
The `buildable` flag is useful to make sure Spack crashes if there is a path error,
rather than Spack silently rebuilding everything because a typo means `cmake` isn't found.
You can verify your environment is set up correctly by running `spack graph` or `spack spec`.
For example:
````bash
> spack graph kokkos +cuda
o kokkos
|\
o | cuda
/
o cmake
````
Without the existing CUDA and CMake being identified in `packages.yaml`, (a subset of!) the output would be:
````bash
o kokkos
|\
| o cmake
| |\
| | | |\
| | | | | |\
| | | | | | | |\
| | | | | | | | | |\
| | | | | | | o | | | libarchive
| | | | | | | |\ \ \ \
| | | | | | | | | |\ \ \ \
| | | | | | | | | | | | |_|/
| | | | | | | | | | | |/| |
| | | | | | | | | | | | | o curl
| | |_|_|_|_|_|_|_|_|_|_|/|
| |/| | | |_|_|_|_|_|_|_|/
| | | | |/| | | | | | | |
| | | | o | | | | | | | | openssl
| |/| | | | | | | | | | |
| | | | | | | | | | o | | libxml2
| | |_|_|_|_|_|_|_|/| | |
| | | | | | | | | | |\ \ \
| o | | | | | | | | | | | | zlib
| / / / / / / / / / / / /
| o | | | | | | | | | | | xz
| / / / / / / / / / / /
| o | | | | | | | | | | rhash
| / / / / / / / / / /
| | | | o | | | | | | nettle
| | | | |\ \ \ \ \ \ \
| | | o | | | | | | | | libuv
| | | | o | | | | | | | autoconf
| | |_|/| | | | | | | |
| | | | |/ / / / / / /
| o | | | | | | | | | perl
| o | | | | | | | | | gdbm
| o | | | | | | | | | readline
````
## Configuring Kokkos as a Project Dependency
Say you have a project "SuperScience" which needs to use Kokkos.
In your `package.py` file, you would generally include something like:
````python
class SuperScience(CMakePackage):
  ...
  depends_on("kokkos")
````
Often projects want to tweak behavior when using certain features, e.g.
````python
depends_on("kokkos+cuda", when="+cuda")
````
if your project needs CUDA-specific logic to configure and build.
This illustrates the general principle in Spack of "flowing-up".
A user requests a feature in the final app:
````bash
> spack install superscience+cuda
````
This flows upstream to the Kokkos dependency, causing the `kokkos+cuda` variant to build.
The downstream app (SuperScience) tells the upstream app (Kokkos) how to build.
Because Kokkos is a performance portability library, it somewhat inverts this principle.
Kokkos "flows-down", telling your application how best to configure for performance.
Rather than a downstream app (SuperScience) telling the upstream (Kokkos) what variants to build,
a pre-built Kokkos should be telling the downstream app SuperScience what variants to use.
Kokkos works best when there is an "expert" configuration installed on your system.
Your build should simply request `-DKokkos_ROOT=` and configure appropriately based on the Kokkos it finds.
Kokkos has many, many build variants.
Where possible, projects should only depend on a general Kokkos, not specific variants.
Instead, for each system you build on, we recommend adding a Kokkos configuration to your `packages.yaml` file (usually found in `~/.spack` for specific users).
For a Xeon + Volta system, this could look like:
````yaml
kokkos:
  variants: +cuda +openmp +cuda_lambda +wrapper ^cuda@10.1 cuda_arch=70
  compiler: [gcc@7.2.0]
````
which gives the "best" Kokkos configuration as CUDA+OpenMP optimized for a Volta 70 architecture using CUDA 10.1.
It also enables support for CUDA Lambdas.
The `+wrapper` option tells Kokkos to build with the special `nvcc_wrapper` (more below).
Note here that we use the built-in `cuda_arch` variant of Spack to specify the architecture.
For a Haswell system, we use
````yaml
kokkos:
  variants: +openmp std=14 target=haswell
  compiler: [intel@18]
````
which uses the built-in microarchitecture variants of Spack.
Consult the Spack documentation for more details of Spack microarchitectures
and CUDA architectures.
Spack does not currently provide an AMD GPU microarchitecture option.
If building for HIP or an AMD GPU, Kokkos provides an `amd_gpu_arch` similar to `cuda_arch`.
````yaml
kokkos:
  variants: +hip amd_gpu_arch=vega900
````
Without an optimal default in your `packages.yaml` file, it is highly likely that the default Kokkos configuration you get will not be what you want.
For example, CUDA is not enabled by default (there is no easy logic to conditionally activate this for CUDA-enabled systems).
If you don't specify a CUDA build variant in a `packages.yaml` and you build your Kokkos-dependent project:
````bash
> spack install superscience
````
you may end up just getting the default Kokkos (i.e. Serial).
Some examples are included in the `config/yaml` folder for common platforms.
Before running `spack install <package>` we recommend running `spack spec <package>` to confirm your dependency tree is correct.
For example, with Kokkos Kernels:
````bash
kokkos-kernels@3.0%gcc@8.3.0~blas build_type=RelWithDebInfo ~cblas~complex_double~complex_float~cublas~cuda cuda_arch=none ~cusparse~diy+double execspace_cuda=auto execspace_openmp=auto execspace_serial=auto execspace_threads=auto ~float~lapack~lapacke+layoutleft~layoutright memspace_cudaspace=auto memspace_cudauvmspace=auto +memspace_hostspace~mkl+offset_int+offset_size_t~openmp+ordinal_int~ordinal_int64_t~serial~superlu arch=linux-rhel7-skylake_avx512
^cmake@3.16.2%gcc@8.3.0~doc+ncurses+openssl+ownlibs~qt arch=linux-rhel7-skylake_avx512
^kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=14 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
^cuda@10.1%gcc@8.3.0 arch=linux-rhel7-skylake_avx512
^kokkos-nvcc-wrapper@old%gcc@8.3.0 build_type=RelWithDebInfo +mpi arch=linux-rhel7-skylake_avx512
^openmpi@4.0.2%gcc@8.3.0~cuda+cxx_exceptions fabrics=none ~java~legacylaunchers~memchecker patches=073477a76bba780c67c36e959cd3ee6910743e2735c7e76850ffba6791d498e4 ~pmi schedulers=none ~sqlite3~thread_multiple+vt arch=linux-rhel7-skylake_avx512
````
The output can be very verbose, but we can verify the expected `kokkos`:
````bash
kokkos@3.0%gcc@8.3.0~aggressive_vectorization~amdavx~armv80~armv81~armv8_thunderx~armv8_tx2~bdw~bgq build_type=RelWithDebInfo ~carrizo~compiler_warnings+cuda cuda_arch=none +cuda_lambda~cuda_ldg_intrinsic~cuda_relocatable_device_code~cuda_uvm~debug~debug_bounds_check~debug_dualview_modify_check~deprecated_code~diy~epyc~examples~explicit_instantiation~fiji~gfx901~hpx~hpx_async_dispatch~hsw~hwloc~kaveri~kepler30~kepler32~kepler35~kepler37~knc~knl~maxwell50~maxwell52~maxwell53~memkind~numactl+openmp~pascal60~pascal61~power7~power8~power9+profiling~profiling_load_print~pthread~qthread~rocm~ryzen~serial~skx~snb std=14 ~tests~turing75~vega+volta70~volta72+wrapper~wsm arch=linux-rhel7-skylake_avx512
````
We see that we do have the expected variants, e.g. `+volta70` and `+wrapper`.
### Spack Environments
The encouraged way to use Spack is with Spack environments ([more details here](https://spack-tutorial.readthedocs.io/en/latest/tutorial_environments.html#dealing-with-many-specs-at-once)).
Rather than installing packages one-at-a-time, you add packages to an environment.
After adding all packages, you concretize and install them all.
Using environments, one can explicitly add a desired Kokkos for the environment, e.g.
````bash
> spack add kokkos +cuda +cuda_lambda +volta70
> spack add my_project +my_variant
> ...
> spack install
````
All packages within the environment will build against the CUDA-enabled Kokkos,
even if they only request a default Kokkos.
## NVCC Wrapper
Kokkos is a C++ project, but often builds for the CUDA backend.
This is particularly problematic with CMake. At this point, `nvcc` does not accept all the flags that normally get passed to a C++ compiler.
Kokkos provides `nvcc_wrapper` that identifies correctly as a C++ compiler to CMake and accepts C++ flags, but uses `nvcc` as the underlying compiler.
`nvcc` itself also uses an underlying host compiler, e.g. GCC.
In Spack, the underlying host compiler is specified as below, e.g.:
````bash
> spack install package %gcc@8.0.0
````
This is still valid for Kokkos. To use the special wrapper for CUDA builds, request a desired compiler and simply add the `+wrapper` variant.
````bash
> spack install kokkos +cuda +wrapper %gcc@7.2.0
````
Downstream projects depending on Kokkos need to override their compiler.
Kokkos provides the compiler in a `kokkos_cxx` variable,
which points to either `nvcc_wrapper` when needed or the regular compiler otherwise.
Spack projects already do this to use MPI compiler wrappers.
````python
def cmake_args(self):
  options = []
  ...
  options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["kokkos"].kokkos_cxx)
  ...
  return options
````
Note: `nvcc_wrapper` works with the MPI compiler wrappers.
If building your project with MPI, do NOT set your compiler to `nvcc_wrapper`.
Instead set your compiler to `mpicxx` and `nvcc_wrapper` will be used under the hood.
````python
def cmake_args(self):
  options = []
  ...
  options.append("-DCMAKE_CXX_COMPILER=%s" % self.spec["mpi"].mpicxx)
  ...
  return options
````
To accomplish this, `nvcc_wrapper` must depend on MPI (even though it uses no MPI).
This has the unfortunate consequence that Kokkos CUDA projects not using MPI will implicitly depend on MPI anyway.
This behavior is necessary for now, but will hopefully be removed later.
When using environments, if MPI is not needed, you can remove the MPI dependency with:
````bash
> spack add kokkos-nvcc-wrapper ~mpi
````
## Developing With Spack
Spack has historically been much more suited to *deployment* of mature packages than active testing or developing.
However, recent features have improved support for development.
Future releases are likely to make this even easier and incorporate Git integration.
The most common commands will do a full build and install of the packages.
If doing development, you may wish to merely set up a build environment.
This allows you to modify the source and re-build.
In this case, you can stop after configuring.
Suppose you have a Kokkos checkout in the folder `kokkos-src`:
````bash
> spack dev-build -d kokkos-src -u cmake kokkos@develop +wrapper +openmp
````
This sets up a development environment for you in `kokkos-src` which you can use (Bash example shown):
Note: Always specify `develop` as the version when doing `dev-build`, except in rare cases.
You are usually developing a feature branch that will merge into `develop`,
hence you are making a new `develop` branch.
````bash
> cd kokkos-src
> source spack-build-env.txt
> cd spack-build
> make
````
Before sourcing the Spack development environment, you may wish to save your current environment:
````bash
> declare -px > myenv.sh
````
When done with Spack, you can then restore your original environment:
````bash
> source myenv.sh
````
kokkos-3.7.01/algorithms/ 0000775 0000000 0000000 00000000000 14343743117 0015263 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/algorithms/CMakeLists.txt 0000664 0000000 0000000 00000000405 14343743117 0020022 0 ustar 00root root 0000000 0000000
# Top-level build script of the Algorithms subpackage.
# KOKKOS_SUBPACKAGE / KOKKOS_ADD_TEST_DIRECTORIES / KOKKOS_SUBPACKAGE_POSTPROCESS
# are project macros defined elsewhere in the Kokkos CMake modules.
KOKKOS_SUBPACKAGE(Algorithms)
# The src/ directory is only added when Kokkos_INSTALL_TESTING is not set.
IF (NOT Kokkos_INSTALL_TESTING)
ADD_SUBDIRECTORY(src)
ENDIF()
# The unit tests are skipped for the combination OpenMPTarget backend + NVHPC
# compiler; every other configuration adds the unit_tests directory.
IF(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC))
KOKKOS_ADD_TEST_DIRECTORIES(unit_tests)
ENDIF()
KOKKOS_SUBPACKAGE_POSTPROCESS()
kokkos-3.7.01/algorithms/cmake/ 0000775 0000000 0000000 00000000000 14343743117 0016343 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/algorithms/cmake/Dependencies.cmake 0000664 0000000 0000000 00000000242 14343743117 0021731 0 ustar 00root root 0000000 0000000 TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
TEST_OPTIONAL_TPLS CUSPARSE
)
kokkos-3.7.01/algorithms/cmake/KokkosAlgorithms_config.h.in 0000664 0000000 0000000 00000000116 14343743117 0023737 0 ustar 00root root 0000000 0000000 #ifndef KOKKOS_ALGORITHMS_CONFIG_H
#define KOKKOS_ALGORITHMS_CONFIG_H
#endif
kokkos-3.7.01/algorithms/src/ 0000775 0000000 0000000 00000000000 14343743117 0016052 5 ustar 00root root 0000000 0000000 kokkos-3.7.01/algorithms/src/CMakeLists.txt 0000664 0000000 0000000 00000002202 14343743117 0020606 0 ustar 00root root 0000000 0000000
# Build script for the Algorithms headers.
# KOKKOS_CONFIGURE_FILE presumably generates ${PACKAGE_NAME}_config.h in the
# binary directory (the macro is defined elsewhere) - the generated header is
# appended to ALGO_HEADERS below from CMAKE_CURRENT_BINARY_DIR.
KOKKOS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
#I have to leave these here for tribits
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
#-----------------------------------------------------------------------------
# Collect the headers/sources of this directory, the generated config header,
# and the headers below std_algorithms/ and std_algorithms/impl/.
FILE(GLOB ALGO_HEADERS *.hpp)
FILE(GLOB ALGO_SOURCES *.cpp)
LIST(APPEND ALGO_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h)
APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp)
APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp)
# Install every *.hpp found below this source directory (the directory layout
# is kept because a DIRECTORY install with FILES_MATCHING is used).
INSTALL (
DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/"
DESTINATION ${KOKKOS_HEADER_DIR}
FILES_MATCHING PATTERN "*.hpp"
)
#-----------------------------------------------------------------------------
# We have to pass the sources in here for Tribits
# These will get ignored for standalone CMake and a true interface library made
KOKKOS_ADD_INTERFACE_LIBRARY(
kokkosalgorithms
HEADERS ${ALGO_HEADERS}
SOURCES ${ALGO_SOURCES}
)
# Include directories attached to the kokkosalgorithms target: the top-level
# build directory plus this directory's binary and source directories.
KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms
${KOKKOS_TOP_BUILD_DIR}
${CMAKE_CURRENT_BINARY_DIR}
${CMAKE_CURRENT_SOURCE_DIR}
)
kokkos-3.7.01/algorithms/src/KokkosAlgorithms_dummy.cpp 0000664 0000000 0000000 00000000071 14343743117 0023262 0 ustar 00root root 0000000 0000000 void KOKKOS_ALGORITHMS_SRC_DUMMY_PREVENT_LINK_ERROR() {}
kokkos-3.7.01/algorithms/src/Kokkos_Random.hpp 0000664 0000000 0000000 00000157263 14343743117 0021342 0 ustar 00root root 0000000 0000000 /*
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_RANDOM_HPP
#define KOKKOS_RANDOM_HPP
#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
#define KOKKOS_IMPL_PUBLIC_INCLUDE
#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_RANDOM
#endif
#include
#include
#include
#include
#include
/// \file Kokkos_Random.hpp
/// \brief Pseudorandom number generators
///
/// These generators are based on Vigna, Sebastiano (2014). "An
/// experimental exploration of Marsaglia's xorshift generators,
/// scrambled." See: http://arxiv.org/abs/1402.6246
namespace Kokkos {
// clang-format off
/*Template functions to get equidistributed random numbers from a generator for a specific Scalar type
template<class Generator, class Scalar>
struct rand{
//Max value returned by draw(Generator& gen)
KOKKOS_INLINE_FUNCTION
static Scalar max();
//Returns a value between zero and max()
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen);
//Returns a value between zero and range()
//Note: for floating point values range can be larger than max()
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen, const Scalar& range){}
//Return value between start and end
KOKKOS_INLINE_FUNCTION
static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end);
};
The Random number generators themselves have two components a state-pool and the actual generator
A state-pool manages a number of generators, so that each active thread is able to grab its own.
This allows the generation of random numbers which are independent between threads. Note that
in contrast to CuRand none of the functions of the pool (or the generator) are collectives,
i.e. all functions can be called inside conditionals.
template<class Device>
class Pool {
public:
//The Kokkos device type
using device_type = Device;
//The actual generator type
using generator_type = Generator;
//Default constructor: does not initialize a pool
Pool();
//Initializing constructor: calls init(seed,Device_Specific_Number);
Pool(unsigned int seed);
//Initialize Pool with seed as a starting seed with a pool_size of num_states
//The Random_XorShift64 generator is used in serial to initialize all states,
//thus the initialization process is platform independent and deterministic.
void init(unsigned int seed, int num_states);
//Get a generator. This will lock one of the states, guaranteeing that each thread
//will have its private generator. Note: on Cuda getting a state involves atomics,
//and is thus not deterministic!
generator_type get_state();
//Give a state back to the pool. This unlocks the state, and writes the modified
//state of the generator back to the pool.
void free_state(generator_type gen);
}
template<class DeviceType>
class Generator {
public:
//The Kokkos device type
using device_type = DeviceType;
//Max return values of respective [X]rand[S]() functions
enum {MAX_URAND = 0xffffffffU};
enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
//Init with a state and the idx with respect to pool. Note: in serial the
//Generator can be used by just giving it the necessary state arguments
KOKKOS_INLINE_FUNCTION
Generator (STATE_ARGUMENTS, int state_idx = 0);
//Draw a equidistributed uint32_t in the range [0,MAX_URAND)
KOKKOS_INLINE_FUNCTION
uint32_t urand();
//Draw a equidistributed uint64_t in the range [0,MAX_URAND64)
KOKKOS_INLINE_FUNCTION
uint64_t urand64();
//Draw a equidistributed uint32_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range);
//Draw a equidistributed uint32_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end );
//Draw a equidistributed uint64_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range);
//Draw a equidistributed uint64_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end );
//Draw a equidistributed int in the range [0,MAX_RAND)
KOKKOS_INLINE_FUNCTION
int rand();
//Draw a equidistributed int in the range [0,range)
KOKKOS_INLINE_FUNCTION
int rand(const int& range);
//Draw a equidistributed int in the range [start,end)
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end );
//Draw a equidistributed int64_t in the range [0,MAX_RAND64)
KOKKOS_INLINE_FUNCTION
int64_t rand64();
//Draw a equidistributed int64_t in the range [0,range)
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range);
//Draw a equidistributed int64_t in the range [start,end)
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end );
//Draw a equidistributed float in the range [0,1.0)
KOKKOS_INLINE_FUNCTION
float frand();
//Draw a equidistributed float in the range [0,range)
KOKKOS_INLINE_FUNCTION
float frand(const float& range);
//Draw a equidistributed float in the range [start,end)
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end );
//Draw a equidistributed double in the range [0,1.0)
KOKKOS_INLINE_FUNCTION
double drand();
//Draw a equidistributed double in the range [0,range)
KOKKOS_INLINE_FUNCTION
double drand(const double& range);
//Draw a equidistributed double in the range [start,end)
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end );
//Draw a standard normal distributed double
KOKKOS_INLINE_FUNCTION
double normal() ;
//Draw a normal distributed double with given mean and standard deviation
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev=1.0);
}
//Additional Functions:
//Fills view with random numbers in the range [0,range)
template<class ViewType, class PoolType>
void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
//Fills view with random numbers in the range [start,end)
template<class ViewType, class PoolType>
void fill_random(ViewType view, PoolType pool,
ViewType::value_type start, ViewType::value_type end);
*/
// clang-format on
// Primary declaration of the rand trait. It is only declared here; the struct
// bodies that follow provide the per-type definitions.
// NOTE(review): the template parameter list is not visible in this copy of
// the file (text between angle brackets appears to be missing) - confirm
// against upstream before building.
template
struct rand;
template
struct rand {
KOKKOS_INLINE_FUNCTION
static short max() { return 127; }
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen) {
return short((gen.rand() & 0xff + 256) % 256);
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const char& range) {
return char(gen.rand(range));
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const char& start, const char& end) {
return char(gen.rand(start, end));
}
};
template
struct rand {
KOKKOS_INLINE_FUNCTION
static short max() { return 32767; }
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen) {
return short((gen.rand() & 0xffff + 65536) % 32768);
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const short& range) {
return short(gen.rand(range));
}
KOKKOS_INLINE_FUNCTION
static short draw(Generator& gen, const short& start, const short& end) {
return short(gen.rand(start, end));
}
};
// rand definition whose members return int (presumably the `int`
// specialization - the template arguments are not visible in this copy;
// confirm against upstream). Every member forwards to the generator.
template
struct rand {
// Maximum of the un-ranged draw: the generator's MAX_RAND constant.
KOKKOS_INLINE_FUNCTION
static int max() { return Generator::MAX_RAND; }
// Un-ranged draw: gen.rand().
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen) { return gen.rand(); }
// Draw limited by `range`: gen.rand(range).
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen, const int& range) { return gen.rand(range); }
// Draw between `start` and `end`: gen.rand(start, end).
KOKKOS_INLINE_FUNCTION
static int draw(Generator& gen, const int& start, const int& end) {
return gen.rand(start, end);
}
};
// rand definition whose members return unsigned int (presumably the
// `unsigned int` specialization - the template arguments are not visible in
// this copy; confirm against upstream). Every member forwards to the
// generator's urand family.
template
struct rand {
// Maximum of the un-ranged draw: the generator's MAX_URAND constant.
KOKKOS_INLINE_FUNCTION
static unsigned int max() { return Generator::MAX_URAND; }
// Un-ranged draw: gen.urand().
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen) { return gen.urand(); }
// Draw limited by `range`: gen.urand(range).
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen, const unsigned int& range) {
return gen.urand(range);
}
// Draw between `start` and `end`: gen.urand(start, end).
KOKKOS_INLINE_FUNCTION
static unsigned int draw(Generator& gen, const unsigned int& start,
const unsigned int& end) {
return gen.urand(start, end);
}
};
// rand definition whose members return long (presumably the `long`
// specialization - the template arguments and the static_cast target types
// are not visible in this copy; confirm against upstream).
// Each member picks the 32-bit generator call (rand) when sizeof(long) == 4
// and the 64-bit call (rand64) otherwise; the choice is made with a runtime
// conditional expression on a compile-time constant.
template
struct rand {
// Maximum of the un-ranged draw: MAX_RAND or MAX_RAND64 depending on
// sizeof(long).
KOKKOS_INLINE_FUNCTION
static long max() {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4 ? static_cast(Generator::MAX_RAND)
: static_cast(Generator::MAX_RAND64);
}
// Un-ranged draw: gen.rand() or gen.rand64().
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4 ? static_cast(gen.rand())
: static_cast(gen.rand64());
}
// Draw limited by `range`; the 32-bit branch casts `range` before the call.
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen, const long& range) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4
? static_cast(gen.rand(static_cast(range)))
: static_cast(gen.rand64(range));
}
// Draw between `start` and `end`; the 32-bit branch casts both bounds before
// the call.
KOKKOS_INLINE_FUNCTION
static long draw(Generator& gen, const long& start, const long& end) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(long) == 4
? static_cast(
gen.rand(static_cast(start), static_cast(end)))
: static_cast(gen.rand64(start, end));
}
};
// rand definition whose members return unsigned long (presumably the
// `unsigned long` specialization - the template arguments and the static_cast
// target types are not visible in this copy; confirm against upstream).
// Each member picks the 32-bit generator call (urand) when
// sizeof(unsigned long) == 4 and the 64-bit call (urand64) otherwise.
template
struct rand {
// Maximum of the un-ranged draw: MAX_URAND or MAX_URAND64 depending on
// sizeof(unsigned long).
KOKKOS_INLINE_FUNCTION
static unsigned long max() {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast(Generator::MAX_URAND)
: static_cast(Generator::MAX_URAND64);
}
// Un-ranged draw: gen.urand() or gen.urand64().
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast(gen.urand())
: static_cast(gen.urand64());
}
// Draw limited by `range`; the 32-bit branch casts `range` before the call.
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen, const unsigned long& range) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast(
gen.urand(static_cast(range)))
: static_cast(gen.urand64(range));
}
// Draw between `start` and `end`; the 32-bit branch casts both bounds before
// the call.
KOKKOS_INLINE_FUNCTION
static unsigned long draw(Generator& gen, const unsigned long& start,
const unsigned long& end) {
// FIXME (mfh 26 Oct 2014) It would be better to select the
// return value at compile time, using something like enable_if.
return sizeof(unsigned long) == 4
? static_cast(
gen.urand(static_cast(start),
static_cast(end)))
: static_cast(gen.urand64(start, end));
}
};
// NOTE (mfh 26 oct 2014) This is a partial specialization for long
// long, a C99 / C++11 signed type which is guaranteed to be at
// least 64 bits. Do NOT write a partial specialization for
// int64_t!!! This is just an alias! It could be either long or
// long long. We don't know which a priori, and I've seen both.
// The types long and long long are guaranteed to differ, so it's
// always safe to specialize for both.
// rand definition whose members return long long (template arguments are not
// visible in this copy; confirm against upstream). Every member forwards to
// the generator's 64-bit signed calls (rand64).
template
struct rand {
// Maximum of the un-ranged draw: the generator's MAX_RAND64 constant.
KOKKOS_INLINE_FUNCTION
static long long max() {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return Generator::MAX_RAND64;
}
// Un-ranged draw: gen.rand64().
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64();
}
// Draw limited by `range`: gen.rand64(range).
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen, const long long& range) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64(range);
}
// Draw between `start` and `end`: gen.rand64(start, end).
KOKKOS_INLINE_FUNCTION
static long long draw(Generator& gen, const long long& start,
const long long& end) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.rand64(start, end);
}
};
// NOTE (mfh 26 oct 2014) This is a partial specialization for
// unsigned long long, a C99 / C++11 unsigned type which is
// guaranteed to be at least 64 bits. Do NOT write a partial
// specialization for uint64_t!!! This is just an alias! It could
// be either unsigned long or unsigned long long. We don't know
// which a priori, and I've seen both. The types unsigned long and
// unsigned long long are guaranteed to differ, so it's always safe
// to specialize for both.
// rand definition whose members return unsigned long long (template arguments
// are not visible in this copy; confirm against upstream). Every member
// forwards to the generator's 64-bit unsigned calls (urand64).
template
struct rand {
// Maximum of the un-ranged draw: the generator's MAX_URAND64 constant.
KOKKOS_INLINE_FUNCTION
static unsigned long long max() {
// FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64
// bits.
return Generator::MAX_URAND64;
}
// Un-ranged draw: gen.urand64().
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen) {
// FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64
// bits.
return gen.urand64();
}
// Draw limited by `range`: gen.urand64(range).
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen,
const unsigned long long& range) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.urand64(range);
}
// Draw between `start` and `end`: gen.urand64(start, end).
KOKKOS_INLINE_FUNCTION
static unsigned long long draw(Generator& gen,
const unsigned long long& start,
const unsigned long long& end) {
// FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
return gen.urand64(start, end);
}
};
// Only compiled when KOKKOS_HALF_T_IS_FLOAT is defined and evaluates to 0,
// i.e. when half_t is a type distinct from float.
#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
// rand definition for Kokkos::Experimental::half_t (template arguments are
// not visible in this copy; confirm against upstream). Draws are produced by
// the generator's float calls (frand) and converted to half_t.
template
struct rand {
using half = Kokkos::Experimental::half_t;
// Maximum advertised for the un-ranged draw.
KOKKOS_INLINE_FUNCTION
static half max() { return half(1.0); }
// Un-ranged draw: gen.frand() converted to half.
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen) { return half(gen.frand()); }
// Draw limited by `range`; the bound is converted to float for the call.
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& range) {
return half(gen.frand(float(range)));
}
// Draw between `start` and `end`; both bounds are converted to float for the
// call.
KOKKOS_INLINE_FUNCTION
static half draw(Generator& gen, const half& start, const half& end) {
return half(gen.frand(float(start), float(end)));
}
};
#endif // defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
// Only compiled when KOKKOS_BHALF_T_IS_FLOAT is defined and evaluates to 0,
// i.e. when bhalf_t is a type distinct from float.
#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
// rand definition for Kokkos::Experimental::bhalf_t (template arguments are
// not visible in this copy; confirm against upstream). Draws are produced by
// the generator's float calls (frand) and converted to bhalf_t.
template
struct rand {
using bhalf = Kokkos::Experimental::bhalf_t;
// Maximum advertised for the un-ranged draw.
KOKKOS_INLINE_FUNCTION
static bhalf max() { return bhalf(1.0); }
// Un-ranged draw: gen.frand() converted to bhalf.
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen) { return bhalf(gen.frand()); }
// Draw limited by `range`; the bound is converted to float for the call.
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen, const bhalf& range) {
return bhalf(gen.frand(float(range)));
}
// Draw between `start` and `end`; both bounds are converted to float for the
// call.
KOKKOS_INLINE_FUNCTION
static bhalf draw(Generator& gen, const bhalf& start, const bhalf& end) {
return bhalf(gen.frand(float(start), float(end)));
}
};
#endif // defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT
// rand definition whose members return float (presumably the `float`
// specialization - the template arguments are not visible in this copy;
// confirm against upstream). Every member forwards to the generator's frand
// family.
template
struct rand {
// Maximum advertised for the un-ranged draw.
KOKKOS_INLINE_FUNCTION
static float max() { return 1.0f; }
// Un-ranged draw: gen.frand().
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen) { return gen.frand(); }
// Draw limited by `range`: gen.frand(range).
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen, const float& range) {
return gen.frand(range);
}
// Draw between `start` and `end`: gen.frand(start, end).
KOKKOS_INLINE_FUNCTION
static float draw(Generator& gen, const float& start, const float& end) {
return gen.frand(start, end);
}
};
// rand definition whose members return double (presumably the `double`
// specialization - the template arguments are not visible in this copy;
// confirm against upstream). Every member forwards to the generator's drand
// family.
template
struct rand {
// Maximum advertised for the un-ranged draw.
KOKKOS_INLINE_FUNCTION
static double max() { return 1.0; }
// Un-ranged draw: gen.drand().
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen) { return gen.drand(); }
// Draw limited by `range`: gen.drand(range).
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen, const double& range) {
return gen.drand(range);
}
// Draw between `start` and `end`: gen.drand(start, end).
KOKKOS_INLINE_FUNCTION
static double draw(Generator& gen, const double& start, const double& end) {
return gen.drand(start, end);
}
};
// rand definition for Kokkos::complex with float components (the components
// are drawn with frand; the template arguments are not visible in this copy -
// confirm against upstream). Real and imaginary parts are drawn
// independently, real part first.
template
struct rand> {
// Maximum advertised for the un-ranged draw: 1.0 in both components.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex max() {
return Kokkos::complex(1.0, 1.0);
}
// Un-ranged draw: two consecutive gen.frand() calls.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen) {
const float re = gen.frand();
const float im = gen.frand();
return Kokkos::complex(re, im);
}
// Component-wise draw: each part is limited by the matching part of `range`.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen,
const Kokkos::complex& range) {
const float re = gen.frand(real(range));
const float im = gen.frand(imag(range));
return Kokkos::complex(re, im);
}
// Component-wise draw between the matching parts of `start` and `end`.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen,
const Kokkos::complex& start,
const Kokkos::complex& end) {
const float re = gen.frand(real(start), real(end));
const float im = gen.frand(imag(start), imag(end));
return Kokkos::complex(re, im);
}
};
// rand definition for Kokkos::complex with double components (the components
// are drawn with drand; the template arguments are not visible in this copy -
// confirm against upstream). Real and imaginary parts are drawn
// independently, real part first.
template
struct rand> {
// Maximum advertised for the un-ranged draw: 1.0 in both components.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex max() {
return Kokkos::complex(1.0, 1.0);
}
// Un-ranged draw: two consecutive gen.drand() calls.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen) {
const double re = gen.drand();
const double im = gen.drand();
return Kokkos::complex(re, im);
}
// Component-wise draw: each part is limited by the matching part of `range`.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen,
const Kokkos::complex& range) {
const double re = gen.drand(real(range));
const double im = gen.drand(imag(range));
return Kokkos::complex(re, im);
}
// Component-wise draw between the matching parts of `start` and `end`.
KOKKOS_INLINE_FUNCTION
static Kokkos::complex draw(Generator& gen,
const Kokkos::complex& start,
const Kokkos::complex& end) {
const double re = gen.drand(real(start), real(end));
const double im = gen.drand(imag(start), imag(end));
return Kokkos::complex(re, im);
}
};
// Forward declaration of the XorShift1024 pool class template; its definition
// is not part of this section.
// NOTE(review): the template parameter list is not visible in this copy -
// confirm against upstream.
template
class Random_XorShift1024_Pool;
namespace Impl {
// Generator state stored by value: sixteen 64-bit words held in a member
// array.
// NOTE(review): the template parameter lists are not visible in this copy -
// confirm against upstream.
template
struct Random_XorShift1024_State {
uint64_t state_[16];
KOKKOS_DEFAULTED_FUNCTION
Random_XorShift1024_State() = default;
// Copies the sixteen words v(state_idx, 0..15) into the local array.
template
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx) {
for (int i = 0; i < 16; i++) state_[i] = v(state_idx, i);
}
// Read access to word i (no bounds check).
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i]; }
// Write access to word i (no bounds check).
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i]; }
};
// Full specialization of the generator state that does not copy the words:
// it keeps a pointer to the first word of one row of the state view plus the
// element stride, so operator[] reads and writes the view's storage directly.
// NOTE(review): the specialization argument is not visible in this copy -
// confirm against upstream.
template <>
struct Random_XorShift1024_State {
uint64_t* state_;
const int stride_;
// Default state: null pointer with stride 1; must not be indexed.
KOKKOS_FUNCTION
Random_XorShift1024_State() : state_(nullptr), stride_(1){};
// Points at v(state_idx, 0) and records v.stride_1() as the distance between
// consecutive words of that row.
template
KOKKOS_FUNCTION Random_XorShift1024_State(const StateViewType& v,
int state_idx)
: state_(&v(state_idx, 0)), stride_(v.stride_1()) {}
// Read access to word i through the stored pointer and stride.
KOKKOS_FUNCTION
uint64_t operator[](const int i) const { return state_[i * stride_]; }
// Write access to word i through the stored pointer and stride.
KOKKOS_FUNCTION
uint64_t& operator[](const int i) { return state_[i * stride_]; }
};
// Compile-time switch: the primary template derives from std::true_type,
// while the specializations compiled under the CUDA, HIP and OpenMPTarget
// guards derive from std::false_type.
// NOTE(review): the template parameter list and the specialization arguments
// are not visible in this copy (presumably the matching execution spaces) -
// confirm against upstream.
template
struct Random_XorShift1024_UseCArrayState : std::true_type {};
#ifdef KOKKOS_ENABLE_CUDA
template <>
struct Random_XorShift1024_UseCArrayState : std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_HIP
template <>
struct Random_XorShift1024_UseCArrayState
: std::false_type {};
#endif
#ifdef KOKKOS_ENABLE_OPENMPTARGET
template <>
struct Random_XorShift1024_UseCArrayState
: std::false_type {};
#endif
// Generic helper that selects the index of the pool state a caller should
// use. The locks view argument is accepted but not used in this version.
// NOTE(review): the template parameter list and the View template arguments
// are not visible in this copy - confirm against upstream.
template
struct Random_UniqueIndex {
using locks_view_type = View;
// On the host the index is the execution space's hardware thread id; in
// device code the index is always 0.
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type) {
KOKKOS_IF_ON_HOST(
(return DeviceType::execution_space::impl_hardware_thread_id();))
KOKKOS_IF_ON_DEVICE((return 0;))
}
};
#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
// Helper macro naming the execution space for this specialization: Kokkos::Cuda
// when CUDA is enabled, otherwise Kokkos::Experimental::HIP. It is undefined
// again right after the specialization.
#if defined(KOKKOS_ENABLE_CUDA)
#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Cuda
#elif defined(KOKKOS_ENABLE_HIP)
#define KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP Kokkos::Experimental::HIP
#endif
// Slot selection for the CUDA/HIP backends, based on the lock view.
// NOTE(review): the template argument lists are missing from this copy of the
// file.
template
struct Random_UniqueIndex<
Kokkos::Device> {
using locks_view_type =
View>;
// Device path: i_offset is the thread's flattened index inside its block; the
// starting slot is the flattened global thread index modulo locks_.extent(0).
// The loop repeats while atomic_compare_exchange(&locks_(i, 0), 0, 1) yields a
// non-zero value (presumably the previous lock value, i.e. the slot was
// taken), stepping i forward by one block's worth of threads and falling back
// to i_offset once i reaches locks_.extent(0).
// NOTE(review): i_offset itself is not reduced modulo locks_.extent(0) —
// confirm the block size can never exceed the number of locks.
// Host path: returns 0 and only marks locks_ as used.
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
KOKKOS_IF_ON_DEVICE((
const int i_offset =
(threadIdx.x * blockDim.y + threadIdx.y) * blockDim.z + threadIdx.z;
int i =
(((blockIdx.x * gridDim.y + blockIdx.y) * gridDim.z + blockIdx.z) *
blockDim.x * blockDim.y * blockDim.z +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim.x * blockDim.y * blockDim.z;
if (i >= static_cast(locks_.extent(0))) {
i = i_offset;
}
}
return i;))
KOKKOS_IF_ON_HOST(((void)locks_; return 0;))
}
};
#undef KOKKOS_IMPL_EXECUTION_SPACE_CUDA_OR_HIP
#endif
#ifdef KOKKOS_ENABLE_SYCL
// Slot selection for the SYCL backend, based on the lock view.
// NOTE(review): the template argument lists are missing from this copy of the
// file.
template
struct Random_UniqueIndex<
Kokkos::Device> {
using locks_view_type =
View>;
// Builds local arrays named threadIdx/blockIdx/blockDim/gridDim from the
// current 3-D nd_item (SYCL dimension 2 is stored at array index 0, dimension
// 0 at index 2; gridDim is global range divided by local range), then uses
// them in the same slot search as the CUDA/HIP specialization: start at the
// flattened global index modulo locks_.extent(0) and repeat while
// atomic_compare_exchange(&locks_(i, 0), 0, 1) yields a non-zero value,
// advancing by one work-group's worth of items and falling back to i_offset
// once i reaches locks_.extent(0).
// NOTE(review): i_offset is not reduced modulo locks_.extent(0) — confirm the
// work-group size can never exceed the number of locks.
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks_) {
auto item = sycl::ext::oneapi::experimental::this_nd_item<3>();
std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1),
item.get_local_id(0)};
std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1),
item.get_group(0)};
std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1),
item.get_local_range(0)};
std::size_t gridDim[3] = {
item.get_global_range(2) / item.get_local_range(2),
item.get_global_range(1) / item.get_local_range(1),
item.get_global_range(0) / item.get_local_range(0)};
// Flattened index of this work-item inside its work-group.
const int i_offset =
(threadIdx[0] * blockDim[1] + threadIdx[1]) * blockDim[2] +
threadIdx[2];
// Flattened global index, wrapped into the range of available locks.
int i =
(((blockIdx[0] * gridDim[1] + blockIdx[1]) * gridDim[2] + blockIdx[2]) *
blockDim[0] * blockDim[1] * blockDim[2] +
i_offset) %
locks_.extent(0);
while (Kokkos::atomic_compare_exchange(&locks_(i, 0), 0, 1)) {
i += blockDim[0] * blockDim[1] * blockDim[2];
if (i >= static_cast(locks_.extent(0))) {
i = i_offset;
}
}
return i;
}
};
#endif
#ifdef KOKKOS_ENABLE_OPENMPTARGET
// Slot selection for the OpenMPTarget backend, based on the lock view.
// NOTE(review): the template argument lists are missing from this copy of the
// file.
template
struct Random_UniqueIndex<
Kokkos::Device> {
using locks_view_type =
View>;
// Starts at omp_get_team_num() * omp_get_num_threads() + omp_get_thread_num()
// and probes consecutive slots, wrapping modulo locks.extent_int(0), for as
// long as atomic_compare_exchange(&locks(i, 0), 0, 1) yields a non-zero value
// (presumably the previous lock value, i.e. the slot was taken).
// NOTE(review): the starting index is used before any modulo is applied —
// confirm it cannot exceed the number of locks.
KOKKOS_FUNCTION
static int get_state_idx(const locks_view_type& locks) {
const int team_size = omp_get_num_threads();
int i = omp_get_team_num() * team_size + omp_get_thread_num();
const int lock_size = locks.extent_int(0);
while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) {
i = (i + 1) % lock_size;
}
return i;
}
};
#endif
} // namespace Impl
// Forward declaration of the pool, which the generator below declares as a
// friend.
template
class Random_XorShift64_Pool;
// Random number generator built around a single 64-bit state word that is
// updated with shift/xor steps. It also remembers the index of the pool slot
// it was created from so the pool can store the state back.
// NOTE(review): angle-bracket argument lists (template parameters, cast target
// types, numeric_limits types) are missing throughout this copy of the file;
// the comments below describe only what the remaining text shows.
template
class Random_XorShift64 {
private:
// Current generator state; modified by every urand()/urand64() call.
uint64_t state_;
// Pool slot this generator belongs to (read by the pool's free_state()).
const int state_idx_;
friend class Random_XorShift64_Pool;
public:
using device_type = DeviceType;
// Upper limits used by the bounded integer draws and as the divisor of the
// floating-point draws.
constexpr static uint32_t MAX_URAND = std::numeric_limits::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits::max();
constexpr static int32_t MAX_RAND = std::numeric_limits::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits::max();
// Creates a generator from a raw state word. A zero state is replaced by
// 1318319, since the shift/xor steps below would leave a zero state at zero.
KOKKOS_INLINE_FUNCTION
Random_XorShift64(uint64_t state, int state_idx = 0)
: state_(state == 0 ? uint64_t(1318319) : state), state_idx_(state_idx) {}
// Advances the state with three shift/xor steps, multiplies it by a fixed
// constant, drops the low 16 bits of the product and masks with MAX_URAND.
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
state_ ^= state_ >> 12;
state_ ^= state_ << 25;
state_ ^= state_ >> 27;
uint64_t tmp = state_ * 2685821657736338717ULL;
tmp = tmp >> 16;
return static_cast(tmp & MAX_URAND);
}
// Advances the state the same way as urand() and returns the full 64-bit
// product minus one.
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
state_ ^= state_ >> 12;
state_ ^= state_ << 25;
state_ ^= state_ >> 27;
return (state_ * 2685821657736338717ULL) - 1;
}
// Draws a value in [0, range) by rejection: max_val is the largest multiple of
// range not exceeding MAX_URAND, and draws at or above it are discarded before
// the modulo is taken. range must be non-zero (it is used as a divisor).
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) tmp = urand();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
// 64-bit counterpart of urand(range); range must be non-zero.
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) tmp = urand64();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
// Returns half of a urand() draw, converted by the cast.
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast(urand() / 2); }
// Signed counterpart of urand(range), built on rand(); range must be non-zero.
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) tmp = rand();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
// Returns half of a urand64() draw, converted by the cast.
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast(urand64() / 2); }
// 64-bit counterpart of rand(range), built on rand64(); range must be
// non-zero.
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) tmp = rand64();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
// Scales a urand64() draw by dividing it by MAX_URAND64.
KOKKOS_INLINE_FUNCTION
float frand() { return urand64() / static_cast(MAX_URAND64); }
// Same as frand(), with the draw multiplied by range before the division.
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64() / static_cast(MAX_URAND64);
}
// Shifts a frand(end - start) draw by start.
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
// Scales a urand64() draw by dividing it by MAX_URAND64.
KOKKOS_INLINE_FUNCTION
double drand() { return urand64() / static_cast(MAX_URAND64); }
// Same as drand(), with the draw multiplied by range before the division.
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64() / static_cast(MAX_URAND64);
}
// Shifts a drand(end - start) draw by start.
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return drand(end - start) + start;
}
// Marsaglia polar method for drawing a standard normal distributed random
// number
// Pairs (U, V) are drawn from 2 * drand() - 1 until U*U + V*V is below 1; only
// the U-based value of the pair is returned, V is discarded.
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while (S >= 1.0) {
U = 2.0 * drand() - 1.0;
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * std::log(S) / S);
}
// Scales a normal() draw by std_dev and shifts it by mean.
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
// Pool of Random_XorShift64 states. Each state lives in row i, column 0 of
// state_, and locks_(i, 0) is the matching lock entry (0 is written when a
// state is handed back). Copies of a pool share the same underlying views.
// NOTE(review): angle-bracket argument lists are missing throughout this copy
// of the file; the affected lines are reproduced as found.
template
class Random_XorShift64_Pool {
 public:
  using device_type = typename DeviceType::device_type;

 private:
  using execution_space = typename device_type::execution_space;
  using locks_type = View;
  using state_data_type = View;
  locks_type locks_;
  state_data_type state_;
  // Number of rows allocated in locks_ and state_.
  int num_states_;
  // Second extent of locks_ and state_; only column 0 is ever accessed.
  int padding_;

 public:
  using generator_type = Random_XorShift64;

  // Creates an empty pool; init() must be called before states are requested.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64_Pool() {
    num_states_ = 0;
    padding_ = 0;
  }

  // Creates a pool with execution_space().concurrency() states derived from
  // seed.
  Random_XorShift64_Pool(uint64_t seed) {
    num_states_ = 0;
    init(seed, execution_space().concurrency());
  }

  // Copies the view handles and all bookkeeping members. padding_ is copied
  // too, so that a later assignment from the copy never reads an
  // uninitialized value.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64_Pool(const Random_XorShift64_Pool& src)
      : locks_(src.locks_),
        state_(src.state_),
        num_states_(src.num_states_),
        padding_(src.padding_) {}

  // Member-wise assignment. The return type (by value) is kept as found so
  // the interface is unchanged.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64_Pool operator=(const Random_XorShift64_Pool& src) {
    locks_ = src.locks_;
    state_ = src.state_;
    num_states_ = src.num_states_;
    padding_ = src.padding_;
    return *this;
  }

  // (Re)allocates the pool with num_states states and fills them from a
  // host-side generator seeded with seed (a zero seed is replaced by 1318319).
  void init(uint64_t seed, int num_states) {
    if (seed == 0) seed = uint64_t(1318319);
    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
    // magic number, or random number I just wanted something not too large and
    // not too small. 64 sounded fine.
    padding_ = num_states < 1000 ? 64 : 1;
    num_states_ = num_states;
    locks_ =
        locks_type("Kokkos::Random_XorShift64::locks", num_states, padding_);
    state_ = state_data_type("Kokkos::Random_XorShift64::state", num_states_,
                             padding_);
    typename state_data_type::HostMirror h_state =
        Kokkos::create_mirror_view(Kokkos::WithoutInitializing, state_);
    typename locks_type::HostMirror h_lock =
        Kokkos::create_mirror_view(Kokkos::WithoutInitializing, locks_);
    // Execute on the HostMirror's default execution space.
    Random_XorShift64
        gen(seed, 0);
    // Discard the first 17 draws before generating any state.
    for (int i = 0; i < 17; i++) gen.rand();
    for (int i = 0; i < num_states_; i++) {
      int n1 = gen.rand();
      int n2 = gen.rand();
      int n3 = gen.rand();
      int n4 = gen.rand();
      // Assemble one 64-bit state from the low 16 bits of four draws.
      h_state(i, 0) = (((static_cast(n1)) & 0xffff) << 00) |
                      (((static_cast(n2)) & 0xffff) << 16) |
                      (((static_cast(n3)) & 0xffff) << 32) |
                      (((static_cast(n4)) & 0xffff) << 48);
      h_lock(i, 0) = 0;
    }
    deep_copy(state_, h_state);
    deep_copy(locks_, h_lock);
  }

  // Picks a slot via Impl::Random_UniqueIndex and returns a generator
  // initialized from that slot's stored state.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64 get_state() const {
    const int i = Impl::Random_UniqueIndex::get_state_idx(locks_);
    return Random_XorShift64(state_(i, 0), i);
  }

  // NOTE: state_idx MUST be unique and less than num_states
  KOKKOS_INLINE_FUNCTION
  Random_XorShift64 get_state(const int state_idx) const {
    return Random_XorShift64(state_(state_idx, 0), state_idx);
  }

  // Stores the generator's current state back into its slot and writes 0 to
  // the slot's lock entry.
  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift64& state) const {
    state_(state.state_idx_, 0) = state.state_;
    locks_(state.state_idx_, 0) = 0;
  }
};
// Random number generator working on a 16-word state that is accessed through
// Impl::Random_XorShift1024_State, with p_ as the rotating position inside
// that state. It also remembers the pool slot it was created from.
// NOTE(review): angle-bracket argument lists (template parameters, cast target
// types, numeric_limits types) are missing throughout this copy of the file;
// the comments below describe only what the remaining text shows.
template
class Random_XorShift1024 {
using execution_space = typename DeviceType::execution_space;
private:
// Current position in the 16-word state; always kept in 0..15 by "& 15".
int p_;
// Pool slot this generator belongs to (read by the pool's free_state()).
const int state_idx_;
// State storage; the variant is chosen by Random_XorShift1024_UseCArrayState.
Impl::Random_XorShift1024_State<
Impl::Random_XorShift1024_UseCArrayState::value>
state_;
friend class Random_XorShift1024_Pool;
public:
using pool_type = Random_XorShift1024_Pool;
using device_type = DeviceType;
// Upper limits used by the bounded integer draws and as the divisor of the
// floating-point draws.
constexpr static uint32_t MAX_URAND = std::numeric_limits::max();
constexpr static uint64_t MAX_URAND64 = std::numeric_limits::max();
constexpr static int32_t MAX_RAND = std::numeric_limits::max();
constexpr static int64_t MAX_RAND64 = std::numeric_limits::max();
// Creates a generator bound to row state_idx of the pool's state view, with p
// as the starting position.
KOKKOS_INLINE_FUNCTION
Random_XorShift1024(const typename pool_type::state_data_type& state, int p,
int state_idx = 0)
: p_(p), state_idx_(state_idx), state_(state, state_idx) {}
// Reads the word at p_, advances p_ by one (mod 16), reads the word there,
// mixes both with shift/xor steps, stores the combined word at the new p_,
// multiplies it by a fixed constant, drops the low 16 bits and masks with
// MAX_URAND.
KOKKOS_INLINE_FUNCTION
uint32_t urand() {
uint64_t state_0 = state_[p_];
uint64_t state_1 = state_[p_ = (p_ + 1) & 15];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
uint64_t tmp = (state_[p_] = state_0 ^ state_1) * 1181783497276652981ULL;
tmp = tmp >> 16;
return static_cast(tmp & MAX_URAND);
}
// Performs the same state update as urand() and returns the full 64-bit
// product minus one.
KOKKOS_INLINE_FUNCTION
uint64_t urand64() {
uint64_t state_0 = state_[p_];
uint64_t state_1 = state_[p_ = (p_ + 1) & 15];
state_1 ^= state_1 << 31;
state_1 ^= state_1 >> 11;
state_0 ^= state_0 >> 30;
return ((state_[p_] = state_0 ^ state_1) * 1181783497276652981LL) - 1;
}
// Draws a value in [0, range) by rejection: max_val is the largest multiple of
// range not exceeding MAX_URAND, and draws at or above it are discarded before
// the modulo is taken. range must be non-zero (it is used as a divisor).
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& range) {
const uint32_t max_val = (MAX_URAND / range) * range;
uint32_t tmp = urand();
while (tmp >= max_val) tmp = urand();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
uint32_t urand(const uint32_t& start, const uint32_t& end) {
return urand(end - start) + start;
}
// 64-bit counterpart of urand(range); range must be non-zero.
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& range) {
const uint64_t max_val = (MAX_URAND64 / range) * range;
uint64_t tmp = urand64();
while (tmp >= max_val) tmp = urand64();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
uint64_t urand64(const uint64_t& start, const uint64_t& end) {
return urand64(end - start) + start;
}
// Returns half of a urand() draw, converted by the cast.
KOKKOS_INLINE_FUNCTION
int rand() { return static_cast(urand() / 2); }
// Signed counterpart of urand(range), built on rand(); range must be non-zero.
KOKKOS_INLINE_FUNCTION
int rand(const int& range) {
const int max_val = (MAX_RAND / range) * range;
int tmp = rand();
while (tmp >= max_val) tmp = rand();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
int rand(const int& start, const int& end) {
return rand(end - start) + start;
}
// Returns half of a urand64() draw, converted by the cast.
KOKKOS_INLINE_FUNCTION
int64_t rand64() { return static_cast(urand64() / 2); }
// 64-bit counterpart of rand(range), built on rand64(); range must be
// non-zero.
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& range) {
const int64_t max_val = (MAX_RAND64 / range) * range;
int64_t tmp = rand64();
while (tmp >= max_val) tmp = rand64();
return tmp % range;
}
// Draws a value in [start, end) by shifting a draw from [0, end - start).
KOKKOS_INLINE_FUNCTION
int64_t rand64(const int64_t& start, const int64_t& end) {
return rand64(end - start) + start;
}
// Scales a urand64() draw by dividing it by MAX_URAND64.
KOKKOS_INLINE_FUNCTION
float frand() { return urand64() / static_cast(MAX_URAND64); }
// Same as frand(), with the draw multiplied by range before the division.
KOKKOS_INLINE_FUNCTION
float frand(const float& range) {
return range * urand64() / static_cast(MAX_URAND64);
}
// Shifts a frand(end - start) draw by start.
KOKKOS_INLINE_FUNCTION
float frand(const float& start, const float& end) {
return frand(end - start) + start;
}
// Scales a urand64() draw by dividing it by MAX_URAND64.
KOKKOS_INLINE_FUNCTION
double drand() { return urand64() / static_cast(MAX_URAND64); }
// Same as drand(), with the draw multiplied by range before the division.
KOKKOS_INLINE_FUNCTION
double drand(const double& range) {
return range * urand64() / static_cast(MAX_URAND64);
}
// Shifts a drand(end - start) draw by start.
KOKKOS_INLINE_FUNCTION
double drand(const double& start, const double& end) {
return drand(end - start) + start;
}
// Marsaglia polar method for drawing a standard normal distributed random
// number
// Pairs (U, V) are drawn from 2 * drand() - 1 until U*U + V*V is below 1; only
// the U-based value of the pair is returned, V is discarded.
KOKKOS_INLINE_FUNCTION
double normal() {
double S = 2.0;
double U;
while (S >= 1.0) {
U = 2.0 * drand() - 1.0;
const double V = 2.0 * drand() - 1.0;
S = U * U + V * V;
}
return U * std::sqrt(-2.0 * std::log(S) / S);
}
// Scales a normal() draw by std_dev and shifts it by mean.
KOKKOS_INLINE_FUNCTION
double normal(const double& mean, const double& std_dev = 1.0) {
return mean + normal() * std_dev;
}
};
// Pool of Random_XorShift1024 states. Row i of state_ holds the 16 words of
// one state, p_(i, 0) its current position, and locks_(i, 0) the matching lock
// entry (0 is written when a state is handed back). Copies of a pool share the
// same underlying views.
// NOTE(review): angle-bracket argument lists are missing throughout this copy
// of the file; the affected lines are reproduced as found.
template
class Random_XorShift1024_Pool {
 public:
  using device_type = typename DeviceType::device_type;

 private:
  using execution_space = typename device_type::execution_space;
  using locks_type = View;
  using int_view_type = View;
  using state_data_type = View;
  locks_type locks_;
  state_data_type state_;
  int_view_type p_;
  // Number of rows allocated in locks_, state_ and p_.
  int num_states_;
  // Second extent of locks_ and p_; only column 0 is ever accessed.
  int padding_;
  friend class Random_XorShift1024;

 public:
  using generator_type = Random_XorShift1024;

  // Creates an empty pool; init() must be called before states are requested.
  // padding_ is zeroed as well, matching Random_XorShift64_Pool, so that
  // copying or assigning from an empty pool never reads an uninitialized
  // value.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024_Pool() {
    num_states_ = 0;
    padding_ = 0;
  }

  // Creates a pool with execution_space().concurrency() states derived from
  // seed.
  inline Random_XorShift1024_Pool(uint64_t seed) {
    num_states_ = 0;
    init(seed, execution_space().concurrency());
  }

  // Copies the view handles and all bookkeeping members, including padding_.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src)
      : locks_(src.locks_),
        state_(src.state_),
        p_(src.p_),
        num_states_(src.num_states_),
        padding_(src.padding_) {}

  // Member-wise assignment. The return type (by value) is kept as found so
  // the interface is unchanged.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024_Pool operator=(const Random_XorShift1024_Pool& src) {
    locks_ = src.locks_;
    state_ = src.state_;
    p_ = src.p_;
    num_states_ = src.num_states_;
    padding_ = src.padding_;
    return *this;
  }

  // (Re)allocates the pool with num_states states and fills them from a
  // host-side Random_XorShift64 generator seeded with seed (a zero seed is
  // replaced by 1318319).
  inline void init(uint64_t seed, int num_states) {
    if (seed == 0) seed = uint64_t(1318319);
    // I only want to pad on CPU like archs (less than 1000 threads). 64 is a
    // magic number, or random number I just wanted something not too large and
    // not too small. 64 sounded fine.
    padding_ = num_states < 1000 ? 64 : 1;
    num_states_ = num_states;
    locks_ =
        locks_type("Kokkos::Random_XorShift1024::locks", num_states_, padding_);
    state_ = state_data_type("Kokkos::Random_XorShift1024::state", num_states_);
    p_ = int_view_type("Kokkos::Random_XorShift1024::p", num_states_, padding_);
    typename state_data_type::HostMirror h_state =
        Kokkos::create_mirror_view(Kokkos::WithoutInitializing, state_);
    typename locks_type::HostMirror h_lock =
        Kokkos::create_mirror_view(Kokkos::WithoutInitializing, locks_);
    typename int_view_type::HostMirror h_p =
        Kokkos::create_mirror_view(Kokkos::WithoutInitializing, p_);
    // Execute on the HostMirror's default execution space.
    Random_XorShift64
        gen(seed, 0);
    // Discard the first 17 draws before generating any state.
    for (int i = 0; i < 17; i++) gen.rand();
    for (int i = 0; i < num_states_; i++) {
      for (int j = 0; j < 16; j++) {
        int n1 = gen.rand();
        int n2 = gen.rand();
        int n3 = gen.rand();
        int n4 = gen.rand();
        // Assemble one 64-bit word from the low 16 bits of four draws.
        h_state(i, j) = (((static_cast(n1)) & 0xffff) << 00) |
                        (((static_cast(n2)) & 0xffff) << 16) |
                        (((static_cast(n3)) & 0xffff) << 32) |
                        (((static_cast(n4)) & 0xffff) << 48);
      }
      h_p(i, 0) = 0;
      h_lock(i, 0) = 0;
    }
    deep_copy(state_, h_state);
    deep_copy(locks_, h_lock);
    // Publish the host-side positions as well, so p_ does not depend on how
    // the view was initialized when it was allocated.
    deep_copy(p_, h_p);
  }

  // Picks a slot via Impl::Random_UniqueIndex and returns a generator bound to
  // that slot's state and stored position.
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024 get_state() const {
    const int i = Impl::Random_UniqueIndex::get_state_idx(locks_);
    return Random_XorShift1024(state_, p_(i, 0), i);
  }

  // NOTE: state_idx MUST be unique and less than num_states
  KOKKOS_INLINE_FUNCTION
  Random_XorShift1024 get_state(const int state_idx) const {
    return Random_XorShift1024(state_, p_(state_idx, 0), state_idx);
  }

  // Stores the generator's 16 state words and position back into its slot and
  // writes 0 to the slot's lock entry.
  KOKKOS_INLINE_FUNCTION
  void free_state(const Random_XorShift1024& state) const {
    for (int i = 0; i < 16; i++) state_(state.state_idx_, i) = state.state_[i];
    p_(state.state_idx_, 0) = state.p_;
    locks_(state.state_idx_, 0) = 0;
  }
};
namespace Impl {
// Functor family that fills a view with values drawn between begin and end
// using generators taken from a random pool. The primary template is only
// declared; each specialization below handles a different number of view
// indices.
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file.
template
struct fill_random_functor_begin_end;
// Specialization for a view accessed with no indices (a()).
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Acquires a generator, writes one draw to a() and returns the generator to
// the pool. The work index is unused, so every invocation writes the same
// element.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
a() = Rand::draw(gen, begin, end);
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with one index (a(idx)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i fills the entries i * loops ... i * loops + loops - 1 that lie
// below a.extent(0), using a single generator that is acquired once and
// returned to the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0)))
a(idx) = Rand::draw(gen, begin, end);
}
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with two indices (a(idx, k)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i handles the leading indices i * loops ... i * loops + loops - 1
// that lie below a.extent(0) and, for each of them, fills every entry along
// the second extent. One generator is acquired for the whole call and returned
// to the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0))) {
for (IndexType k = 0; k < static_cast(a.extent(1)); k++)
a(idx, k) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with three indices (a(idx, k, l)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i handles the leading indices i * loops ... i * loops + loops - 1
// that lie below a.extent(0) and, for each of them, fills every entry along
// extents 1 and 2. One generator is acquired for the whole call and returned
// to the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0))) {
for (IndexType k = 0; k < static_cast(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast(a.extent(2)); l++)
a(idx, k, l) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with four indices (a(idx, k, l, m)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i handles the leading indices i * loops ... i * loops + loops - 1
// that lie below a.extent(0) and, for each of them, fills every entry along
// extents 1 to 3. One generator is acquired for the whole call and returned to
// the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0))) {
for (IndexType k = 0; k < static_cast(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast(a.extent(3)); m++)
a(idx, k, l, m) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with five indices (a(idx, l, m, n, o)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i handles the leading indices i * loops ... i * loops + loops - 1
// that lie below a.extent(0) and, for each of them, fills every entry along
// extents 1 to 4. One generator is acquired for the whole call and returned to
// the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0))) {
for (IndexType l = 0; l < static_cast(a.extent(1)); l++)
for (IndexType m = 0; m < static_cast(a.extent(2)); m++)
for (IndexType n = 0; n < static_cast(a.extent(3)); n++)
for (IndexType o = 0; o < static_cast(a.extent(4));
o++)
a(idx, l, m, n, o) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
// Specialization for a view accessed with six indices
// (a(idx, k, l, m, n, o)).
// NOTE(review): the template parameter and specialization argument lists are
// missing from this copy of the file; `loops` is presumably one of the
// template parameters.
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand;
// Stores the target view, the pool and the two bounds.
fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
typename ViewType::const_value_type begin_,
typename ViewType::const_value_type end_)
: a(a_), rand_pool(rand_pool_), begin(begin_), end(end_) {}
// Work item i handles the leading indices i * loops ... i * loops + loops - 1
// that lie below a.extent(0) and, for each of them, fills every entry along
// extents 1 to 5. One generator is acquired for the whole call and returned to
// the pool afterwards.
KOKKOS_INLINE_FUNCTION
void operator()(IndexType i) const {
typename RandomPool::generator_type gen = rand_pool.get_state();
for (IndexType j = 0; j < loops; j++) {
const IndexType idx = i * loops + j;
if (idx < static_cast(a.extent(0))) {
for (IndexType k = 0; k < static_cast(a.extent(1)); k++)
for (IndexType l = 0; l < static_cast(a.extent(2)); l++)
for (IndexType m = 0; m < static_cast(a.extent(3)); m++)
for (IndexType n = 0; n < static_cast(a.extent(4));
n++)
for (IndexType o = 0; o < static_cast(a.extent(5));
o++)
a(idx, k, l, m, n, o) = Rand::draw(gen, begin, end);
}
}
rand_pool.free_state(gen);
}
};
template
struct fill_random_functor_begin_end {
ViewType a;
RandomPool rand_pool;
typename ViewType::const_value_type begin, end;
using Rand = rand