pax_global_header00006660000000000000000000000064144764334770014535gustar00rootroot0000000000000052 comment=ab22363cada7681c93fee83f1109cde489aa66bf hipCUB-rocm-5.7.1/000077500000000000000000000000001447643347700136175ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.clang-format000066400000000000000000000076301447643347700162000ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes AttributeMacros: ['HIPCUB_DEVICE', 'HIPCUB_HOST', 'HIPCUB_HOST_DEVICE', 'HIPCUB_SHARED_MEMORY', 'HIPCUB_RUNTIME_FUNCTION'] BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true BeforeCatch: true BeforeElse: true AfterExternBlock: false BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false --- hipCUB-rocm-5.7.1/.githooks/000077500000000000000000000000001447643347700155245ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.githooks/install000077500000000000000000000002121447643347700171130ustar00rootroot00000000000000#!/bin/sh cd "$(git rev-parse --git-dir)" cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" hipCUB-rocm-5.7.1/.githooks/pre-commit000077500000000000000000000005161447643347700175300ustar00rootroot00000000000000#!/bin/sh # Redirect output to stderr. exec 1>&2 # Do the code format check if ! "$(git rev-parse --show-toplevel)/scripts/code-format/check-format.sh" HEAD --cached 1>&2; then printf " Pre-commit check failed, please fix the reported errors. Note: Use '\033[33mgit commit --no-verify\033[0m' to bypass checks.\n" exit 1 fi hipCUB-rocm-5.7.1/.github/000077500000000000000000000000001447643347700151575ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.github/dependabot.yml000066400000000000000000000010421447643347700200040ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/.sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" hipCUB-rocm-5.7.1/.github/workflows/000077500000000000000000000000001447643347700172145ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.github/workflows/docs.yaml000066400000000000000000000045551447643347700210410ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . -x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' hipCUB-rocm-5.7.1/.gitignore000066400000000000000000000011431447643347700156060ustar00rootroot00000000000000 ### Build dirs ### build/ # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build ### VSCODE ### .vscode .devcontainer # End of https://www.gitignore.io/api/c++,cmake hipCUB-rocm-5.7.1/.gitlab-ci.yml000066400000000000000000000245261447643347700162640ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. include: - project: amd/ci-templates ref: main file: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml - /deps-rocm.yaml - /deps-nvcc.yaml - /gpus-rocm.yaml - /gpus-nvcc.yaml - /rules.yaml stages: - lint - build - test - benchmark clang-format: extends: - .deps:rocm stage: lint needs: [] tags: - rocm-build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - scripts/code-format/check-format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" # hipCUB with rocPRIM backend .rocm: variables: ROCPRIM_GIT_BRANCH: "develop_stream" extends: - .deps:rocm - .deps:cmake-minimum before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] # Install rocPRIM from git - BRANCH_NAME="$ROCPRIM_GIT_BRANCH" - if [ "$CI_COMMIT_BRANCH" = develop -o "$CI_COMMIT_BRANCH" = master ]; then BRANCH_NAME="$CI_COMMIT_BRANCH" - fi; - git clone -b "$BRANCH_NAME" --depth 1 https://gitlab-ci-token:${CI_JOB_TOKEN}@${ROCPRIM_GIT_URL} $CI_PROJECT_DIR/rocPRIM - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D ROCM_DEP_ROCMCORE=OFF -B $CI_PROJECT_DIR/rocPRIM/build -S $CI_PROJECT_DIR/rocPRIM - cd $CI_PROJECT_DIR/rocPRIM/build - cpack -G "DEB" - $SUDO_CMD dpkg -i rocprim*.deb build:rocm: extends: - .rocm - .gpus:rocm-gpus - .rules:build stage: build tags: - rocm-build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D GPU_TARGETS="$GPU_TARGETS" -D GPU_TEST_TARGETS="$GPU_TARGETS" -D ROCM_SYMLINK_LIBS=OFF -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build - cd $CI_PROJECT_DIR/build - cpack -G "DEB;ZIP" artifacts: paths: - $CI_PROJECT_DIR/build/test/hipcub/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - $CI_PROJECT_DIR/build/CTestTestfile.cmake - $CI_PROJECT_DIR/build/hipcub*.deb - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks build:rocm-benchmark: extends: - .rocm - .gpus:rocm-gpus - .rules:build stage: build tags: - rocm-build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_BENCHMARK=ON -D GPU_TARGETS="$GPU_TARGETS" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build artifacts: paths: - $CI_PROJECT_DIR/build/benchmark/* - $CI_PROJECT_DIR/build/deps/googlebenchmark/ - $CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CMakeCache.txt expire_in: 2 weeks test:rocm: stage: test needs: - build:rocm extends: - .rocm - .gpus:rocm - .rules:test script: - cd $CI_PROJECT_DIR/build - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json - ctest --output-on-failure --repeat-until-fail 2 --tests-regex "$GPU_TARGET" --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS .benchmark: stage: benchmark variables: BENCHMARK_FILENAME_REGEX: ^benchmark BENCHMARK_FILTER_REGEX: "" script: - python3 ${CI_PROJECT_DIR}/.gitlab/run_and_upload_benchmarks.py --api_endpoint ${BENCHMARK_API_ENDPOINT} --api_base_folder_id ${BENCHMARK_API_FOLDER_ID} --api_auth_token ${BENCHMARK_API_AUTH_TOKEN} --benchmark_dir ${CI_PROJECT_DIR}/build/benchmark --benchmark_datetime ${CI_PIPELINE_CREATED_AT} --benchmark_version ${CI_COMMIT_REF_SLUG}_MR${CI_MERGE_REQUEST_IID}_${CI_COMMIT_SHORT_SHA} --benchmark_gpu_name "${GPU}" --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_FILTER_REGEX}" --no_upload artifacts: paths: - ${CI_PROJECT_DIR}/build/benchmark/*.json expire_in: 1 week benchmark:rocm: extends: - .rocm - .benchmark - .gpus:rocm - .rules:benchmark needs: - build:rocm-benchmark .test_package: script: - | if [[ -n $GPU_TARGETS ]]; then GPU_TARGETS_ARG="-DGPU_TARGETS=$GPU_TARGETS" else GPU_TARGETS_ARG="" fi - cmake -G Ninja -D rocprim_DIR="/opt/rocm/rocprim" -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" "$GPU_TARGETS_ARG" -S $CI_PROJECT_DIR/test/extra -B $CI_PROJECT_DIR/build/package_test - cmake --build $CI_PROJECT_DIR/build/package_test - cd $CI_PROJECT_DIR/build/package_test - ctest --output-on-failure --repeat-until-fail 2 .test:package: script: - cd $CI_PROJECT_DIR/build - $SUDO_CMD dpkg -i ${HIPCUB_DEV_PACKAGE_WILDCARD} - export CXX - !reference [".test_package", script] - $SUDO_CMD dpkg -r rocprim-dev hipcub-dev .test:install: script: - export CXX - cmake -G Ninja -D BUILD_TEST=OFF -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build_only_install # Preserve $PATH when sudoing - $SUDO_CMD env PATH="$PATH" cmake --install $CI_PROJECT_DIR/build_only_install - !reference [".test_package", script] test:rocm_package: stage: test needs: - build:rocm variables: CXX: "$AMDCLANG" HIPCUB_DEV_PACKAGE_WILDCARD: hipcub-dev*.deb tags: - rocm extends: - .rocm - .gpus:rocm-gpus - .test:package - .rules:test test:rocm_install: stage: test needs: [] variables: CXX: "$AMDCLANG" tags: - rocm extends: - .rocm - .gpus:rocm-gpus - .test:install - .rules:test # hipCUB with CUB backend .nvcc: extends: - .deps:nvcc - .gpus:nvcc-gpus - .deps:cmake-minimum before_script: - !reference [".deps:nvcc", before_script] - !reference [".deps:cmake-minimum", before_script] build:nvcc: stage: build extends: - .nvcc - .rules:build tags: - nvcc-build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D NVGPU_TARGETS="$GPU_TARGETS" -D ROCM_SYMLINK_LIBS=OFF -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build - cd $CI_PROJECT_DIR/build - cpack -G "DEB;ZIP" artifacts: paths: - $CI_PROJECT_DIR/build/test/hipcub/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/hipcub/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - $CI_PROJECT_DIR/build/CTestTestfile.cmake - $CI_PROJECT_DIR/build/hipcub*.deb - $CI_PROJECT_DIR/build/hipcub*.zip - $CI_PROJECT_DIR/build/.ninja_log expire_in: 2 weeks build:nvcc-benchmark: stage: build extends: - .nvcc - .rules:build tags: - nvcc-build needs: [] script: - cmake -G Ninja -D CMAKE_CXX_FLAGS="-Wall -Wextra" -D CMAKE_BUILD_TYPE=Release -D BUILD_BENCHMARK=ON -D NVGPU_TARGETS="$GPU_TARGETS" -B $CI_PROJECT_DIR/build -S $CI_PROJECT_DIR - cmake --build $CI_PROJECT_DIR/build artifacts: paths: - $CI_PROJECT_DIR/build/benchmark/* - $CI_PROJECT_DIR/build/deps/googlebenchmark/ - $CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CMakeCache.txt expire_in: 2 weeks test:nvcc: stage: test needs: - build:nvcc extends: - .nvcc - .gpus:nvcc - .rules:test before_script: # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script - !reference [.nvcc, before_script] script: - cd $CI_PROJECT_DIR/build - ctest --output-on-failure --repeat-until-fail 2 benchmark:nvcc: needs: - build:nvcc-benchmark extends: - .nvcc - .gpus:nvcc - .benchmark - .rules:benchmark before_script: # This is only needed because of the legacy before_script in .gpus:nvcc would otherwise overwrite before_script - !reference [.nvcc, before_script] test:nvcc_package: stage: test needs: - build:nvcc variables: HIPCUB_DEV_PACKAGE_WILDCARD: hipcub_nvcc-dev*.deb tags: - nvcc extends: - .nvcc - .test:package - .rules:test test:nvcc_install: stage: test needs: [] tags: - nvcc extends: - .nvcc - .test:install - .rules:test test:doc: stage: test extends: - .rules:test - .build:docs scheduled-check-changes: extends: .rules:scheduled-check-changes hipCUB-rocm-5.7.1/.gitlab/000077500000000000000000000000001447643347700151375ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.gitlab/cmake-run-benchmarks.txt000066400000000000000000000002301447643347700216700ustar00rootroot00000000000000file(GLOB Benchmarks "${BENCHMARK_BINARY_DIR}/benchmark_*") foreach(Benchmark IN LISTS Benchmarks) execute_process(COMMAND ${Benchmark}) endforeach() hipCUB-rocm-5.7.1/.gitlab/run_and_upload_benchmarks.py000066400000000000000000000174171447643347700227120ustar00rootroot00000000000000import argparse from collections import namedtuple from datetime import datetime import json import os import re import stat import subprocess from urllib.parse import urljoin import urllib.request BenchmarkContext = namedtuple('BenchmarkContext', ['run_datetime', 'version', 'gpu_name', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex']) ApiContext = namedtuple('ApiContext', ['endpoint', 'folder_id', 'auth_token']) def run_benchmarks(benchmark_context): def is_benchmark_executable(filename): if not re.match(benchmark_context.benchmark_filename_regex, filename): return False path = os.path.join(benchmark_context.benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether there is any execution flag set # and it is a regular file (S_IFREG) return (st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG) success = True benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)] json_paths = [] for benchmark_name in benchmark_names: results_json_name = f'{benchmark_name}_{benchmark_context.version}_{benchmark_context.gpu_name}.json' benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name) results_json_path = os.path.join(benchmark_context.benchmark_dir, results_json_name) args = [ benchmark_path, '--benchmark_out_format=json', f'--benchmark_out={results_json_path}', f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' ] try: subprocess.call(args) json_paths.append(results_json_path) except OSError as error: print(f'Could not run benchmark at {benchmark_path}. Error: "{error}"') success = False return success, json_paths def write_system_info(): def try_running_info(executable_name): out_filename = f'{executable_name}.txt' try: run_result = subprocess.run(executable_name, stdout=subprocess.PIPE) if run_result.returncode == 0: with open(out_filename, 'wb') as file: file.write(run_result.stdout) return out_filename except OSError: # Expected, when the executable is not available on the system pass rocminfo_filename = try_running_info('rocminfo') if rocminfo_filename: return rocminfo_filename else: return try_running_info('deviceQuery') def create_benchmark_folder(benchmark_context, api_context): formatted_datetime = datetime.strftime(benchmark_context.run_datetime, '%Y%m%d_%H%M%S') new_folder_name = f'{formatted_datetime}_{benchmark_context.version}_{benchmark_context.gpu_name}' create_folder_url = urljoin(api_context.endpoint, f'files/folder/{api_context.folder_id}') create_folder_payload = json.dumps({ 'title': new_folder_name }).encode('utf-8') create_folder_headers = { 'Content-Type': 'application/json', 'Authorization': api_context.auth_token } create_folder_request = urllib.request.Request( url=create_folder_url, data=create_folder_payload, headers=create_folder_headers, method='POST') try: with urllib.request.urlopen(create_folder_request) as response: response_data = json.loads(response.read()) new_folder_id = response_data['response']['id'] print(f"Created new folder with id {new_folder_id}") return new_folder_id except Exception as ex: print(f'Could not create folder "{new_folder_name}". Error: {ex}') return None def upload_results(folder_id, api_context, paths_to_upload): success = True upload_file_url = urljoin(api_context.endpoint, f'files/{folder_id}/upload') for path in paths_to_upload: with open(path) as file: body_bytes = file.read().encode('utf-8') filename = os.path.basename(path) upload_file_headers = { 'Content-Type': 'text/plain', 'Content-Disposition': f'attachment; filename="{filename}"', 'Authorization': api_context.auth_token } upload_file_request = urllib.request.Request(url=upload_file_url, data=body_bytes, headers=upload_file_headers, method='POST') try: with urllib.request.urlopen(upload_file_request): pass print(f'Uploaded {path}') except Exception as ex: print(f'Could not upload file "{path}". Error: {ex}') success = False return success def parse_date(date_str): """ Parses the date format provided by GitLab's builtin variable CI_PIPELINE_CREATED_AT """ return datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%SZ') def main(): parser = argparse.ArgumentParser() parser.add_argument('--api_endpoint', help='URL that specifies the file storage API endpoint. For example: https://website.com/api/2.0/', required=True) parser.add_argument('--api_base_folder_id', help='The ID of the remote folder to which the benchmark results are uploaded', required=True) parser.add_argument('--api_auth_token', help='The authentication token string required by the remote API', required=True) parser.add_argument('--benchmark_dir', help='The local directory that contains the benchmark executables', required=True) parser.add_argument('--benchmark_datetime', help='The datetime string that specifies the creation date of the benchmarks. For example: "2022-03-28T13:16:09Z"', required=True) parser.add_argument('--benchmark_version', help='The identifier of the source control version of the benchmarked source code. For example a commit hash.', required=True) parser.add_argument('--benchmark_gpu_name', help='The name of the currently enabled GPU', required=True) parser.add_argument('--benchmark_filename_regex', help='Regular expression that controls the list of benchmark executables to run', default=r'^benchmark', required=False) parser.add_argument('--benchmark_filter_regex', help='Regular expression that controls the list of benchmarks to run in each benchmark executable', default='', required=False) parser.add_argument('--no_upload', help='Only run the benchmarks, do not upload them', default=False, action='store_true', required=False) args = parser.parse_args() api_context = ApiContext(args.api_endpoint, args.api_base_folder_id, args.api_auth_token) benchmark_context = BenchmarkContext( parse_date(args.benchmark_datetime), args.benchmark_version, args.benchmark_gpu_name, args.benchmark_dir, args.benchmark_filename_regex, args.benchmark_filter_regex) status = True benchmark_run_successful, to_upload_paths = run_benchmarks(benchmark_context) status = status and benchmark_run_successful sysinfo_path = write_system_info() if sysinfo_path: # not required to be successful. # Not all rocm/nvidia images have rocminfo/deviceQuery in their path to_upload_paths.append(sysinfo_path) if not args.no_upload: upload_successful = False folder_id = create_benchmark_folder(benchmark_context, api_context) if folder_id is not None: upload_successful = upload_results(folder_id, api_context, to_upload_paths) status = status and upload_successful return status if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) hipCUB-rocm-5.7.1/.jenkins/000077500000000000000000000000001447643347700153365ustar00rootroot00000000000000hipCUB-rocm-5.7.1/.jenkins/common.groovy000066400000000000000000000037521447643347700201040ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. def runCompileCommand(platform, project, jobName, boolean debug=false, boolean sameOrg=true) { project.paths.construct_build_prefix() String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' def getRocPRIM = auxiliary.getLibrary('rocPRIM', platform.jenkinsLabel, null, sameOrg) def command = """#!/usr/bin/env bash set -x ${getRocPRIM} cd ${project.paths.project_build_prefix} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project) { String sudo = auxiliary.sudo(platform.jenkinsLabel) def testCommand = "ctest --output-on-failure" def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} cd ${project.testDirectory} ${sudo} LD_LIBRARY_PATH=/opt/rocm/lib ${testCommand} """ platform.runCommand(this, command) } def runPackageCommand(platform, project) { def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this hipCUB-rocm-5.7.1/.jenkins/precheckin.groovy000066400000000000000000000044371447643347700207300ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'PreCheckin') // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } Set seenJobNames = [] jobNameList.each { jobName, nodeDetails-> seenJobNames.add(jobName) if (urlJobName == jobName) runCI(nodeDetails, jobName) } // For url job names that are outside of the standardJobNameSet i.e. compute-rocm-dkms-no-npi-1901 if(!seenJobNames.contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) runCI([ubuntu16:['gfx906']], urlJobName) } } hipCUB-rocm-5.7.1/.jenkins/staticanalysis.groovy000066400000000000000000000034521447643347700216440ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() def command = """#!/usr/bin/env bash set -x ${project.paths.project_build_prefix}/docs/run_doc.sh """ try { platform.runCommand(this, command) } catch(e) { throw e } publishHTML([allowMissing: false, alwaysLinkToLastBuild: false, keepAll: false, reportDir: "${project.paths.project_build_prefix}/docs/_build/html", reportFiles: "index.html", reportName: "Documentation", reportTitles: "Documentation"]) } def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'StaticAnalysis') // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false boolean staticAnalysis = true def compileCommand = { platform, project-> runCompileCommand(platform, project, jobName, false) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) stage(urlJobName) { runCI([ubuntu20:['any']], urlJobName) } } hipCUB-rocm-5.7.1/.jenkins/staticlibrary.groovy000066400000000000000000000045641447643347700214720ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('hipCUB', 'Static Library PreCheckin') def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, false, true) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } hipCUB-rocm-5.7.1/.readthedocs.yaml000066400000000000000000000005441447643347700170510ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/.sphinx/requirements.txt build: os: ubuntu-20.04 tools: python: "3.8" apt_packages: - "doxygen" hipCUB-rocm-5.7.1/CHANGELOG.md000066400000000000000000000151631447643347700154360ustar00rootroot00000000000000# Change Log for hipCUB See README.md on how to build the hipCUB documentation using Doxygen. ## (Unreleased) hipCUB-2.13.1 for ROCm 5.7.0 ### Changed - CUB backend references CUB and Thrust version 2.0.1. - Fixed `DeviceSegmentedReduce::ArgMin` and `DeviceSegmentedReduce::ArgMax` by returning the segment-relative index instead of the absolute one. - Fixed `DeviceSegmentedReduce::ArgMin` for inputs where the segment minimum is smaller than the value returned for empty segments. An equivalent fix is applied to `DeviceSegmentedReduce::ArgMax`. ### Known Issues - `debug_synchronous` no longer works on CUDA platform. `CUB_DEBUG_SYNC` should be used to enable those checks. - `DeviceReduce::Sum` does not compile on CUDA platform for mixed extended-floating-point/floating-point InputT and OutputT types. - `DeviceHistogram::HistogramEven` fails on CUDA platform for `[LevelT, SampleIteratorT] = [int, int]`. - `DeviceHistogram::MultiHistogramEven` fails on CUDA platform for `[LevelT, SampleIteratorT] = [int, int/unsigned short/float/double]` and `[LevelT, SampleIteratorT] = [float, double]`. ## (Unreleased) hipCUB-2.13.1 for ROCm 5.5.0 ### Added - Benchmarks for `BlockShuffle`, `BlockLoad`, and `BlockStore`. ### Changed - CUB backend references CUB and Thrust version 1.17.2. - Improved benchmark coverage of `BlockScan` by adding `ExclusiveScan`, benchmark coverage of `BlockRadixSort` by adding `SortBlockedToStriped`, and benchmark coverage of `WarpScan` by adding `Broadcast`. - Removed references to and workarounds for deprecated hcc ### Known Issues - `BlockRadixRankMatch` is currently broken under the rocPRIM backend. - `BlockRadixRankMatch` with a warp size that does not exactly divide the block size is broken under the CUB backend. ## (Unreleased) hipCUB-2.13.0 for ROCm 5.4.0 ### Added - CMake functionality to improve build parallelism of the test suite that splits compilation units by function or by parameters. - New overload for `BlockAdjacentDifference::SubtractLeftPartialTile` that takes a predecessor item. ### Changed - Improved build parallelism of the test suite by splitting up large compilation units for `DeviceRadixSort`, `DeviceSegmentedRadixSort` and `DeviceSegmentedSort`. - CUB backend references CUB and Thrust version 1.17.1. ### Known Issues - `BlockRadixRankMatch` is currently broken under the rocPRIM backend. - `BlockRadixRankMatch` with a warp size that does not exactly divide the block size is broken under the CUB backend. ## hipCUB-2.12.0 for ROCm 5.3.0 ### Added - UniqueByKey device algorithm - SubtractLeft, SubtractLeftPartialTile, SubtractRight, SubtractRightPartialTile overloads in BlockAdjacentDifference. - The old overloads (FlagHeads, FlagTails, FlagHeadsAndTails) are deprecated. - DeviceAdjacentDifference algorithm. - Extended benchmark suite of `DeviceHistogram`, `DeviceScan`, `DevicePartition`, `DeviceReduce`, `DeviceSegmentedReduce`, `DeviceSegmentedRadixSort`, `DeviceRadixSort`, `DeviceSpmv`, `DeviceMergeSort`, `DeviceSegmentedSort` ### Changed - Obsolated type traits defined in util_type.hpp. Use the standard library equivalents instead. - CUB backend references CUB and thrust version 1.16.0. - DeviceRadixSort's num_items parameter's type is now templated instead of being an int. - If an integral type with a size at most 4 bytes is passed (i.e. an int), the former logic applies. - Otherwise the algorithm uses a larger indexing type that makes it possible to sort input data over 2**32 elements. ## hipCUB-2.11.1 for ROCm 5.2.0 ### Added - Packages for tests and benchmark executable on all supported OSes using CPack. ## hipCUB-2.11.0 for ROCm 5.1.0 ### Added - Device segmented sort - Warp merge sort, WarpMask and thread sort from cub 1.15.0 supported in hipCUB - Device three way partition ### Changed - Device_scan and device_segmented_scan: inclusive_scan now uses the input-type as accumulator-type, exclusive_scan uses initial-value-type. - This particularly changes behaviour of small-size input types with large-size output types (e.g. short input, int output). - And low-res input with high-res output (e.g. float input, double output) - Block merge sort no longer supports non power of two blocksizes ### Known Issues - grid unit test hanging on HIP on Windows ## hipCUB-2.10.13 for ROCm 5.0.0 ### Fixed - Added missing includes to hipcub.hpp ### Added - Bfloat16 support to test cases (device_reduce & device_radix_sort) - Device merge sort - Block merge sort - API update to CUB 1.14.0 ### Changed - The SetupNVCC.cmake automatic target selector select all of the capabalities of all available card for NVIDIA backend. ## hipCUB-2.10.12 for ROCm 4.5.0 ### Added - Initial HIP on Windows support. See README for instructions on how to build and install. ### Changed - Packaging changed to a development package (called hipcub-dev for `.deb` packages, and hipcub-devel for `.rpm` packages). As hipCUB is a header-only library, there is no runtime package. To aid in the transition, the development package sets the "provides" field to provide the package hipcub, so that existing packages depending on hipcub can continue to work. This provides feature is introduced as a deprecated feature and will be removed in a future ROCm release. ## [hipCUB-2.10.11 for ROCm 4.4.0] ### Added - gfx1030 support added. - Address Sanitizer build option ### Fixed - BlockRadixRank unit test failure fixed. ## [hipCUB-2.10.10 for ROCm 4.3.0] ### Added - DiscardOutputIterator to backend header ## [hipCUB-2.10.9 for ROCm 4.2.0] ### Added - Support for TexObjInputIterator and TexRefInputIterator - Support for DevicePartition ### Changed - Minimum cmake version required is now 3.10.2 - CUB backend has been updated to 1.11.0 ### Fixed - Benchmark build fixed - nvcc build fixed ## [hipCUB-2.10.8 for ROCm 4.1.0] ### Added - Support for DiscardOutputIterator ## [hipCUB-2.10.7 for ROCm 4.0.0] ### Added - No new features ## [hipCUB-2.10.6 for ROCm 3.10] ### Added - No new features ## [hipCUB-2.10.5 for ROCm 3.9.0] ### Added - No new features ## [hipCUB-2.10.4 for ROCm 3.8.0] ### Added - No new features ## [hipCUB-2.10.3 for ROCm 3.7.0] ### Added - No new features ## [hipCUB-2.10.2 for ROCm 3.6.0] ### Added - No new features ## [hipCUB-2.10.1 for ROCm 3.5.0] ### Added - Improved tests with fixed and random seeds for test data ### Changed - Switched to hip-clang as default compiler - CMake searches for rocPRIM locally first; downloads from github if local search fails ### Deprecated - HCC build deprecated ### Known Issues - The following unit test failures have been observed. These are due to issues in rocclr runtime. - BlockDiscontinuity - BlockExchange - BlockHistogram - BlockRadixSort - BlockReduce - BlockScan hipCUB-rocm-5.7.1/CMakeLists.txt000066400000000000000000000145741447643347700163720ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. cmake_minimum_required(VERSION 3.16 FATAL_ERROR) cmake_policy(VERSION 3.16...3.21) # Install prefix set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") #Adding CMAKE_PREFIX_PATH list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ${ROCM_PATH}) # hipCUB project project(hipcub LANGUAGES CXX) #Adding CMAKE_PREFIX_PATH list(APPEND CMAKE_PREFIX_PATH /opt/rocm) # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${HIP_PATH}/cmake /opt/rocm/hip/cmake # FindHIP.cmake ${ROCM_PATH}/lib/cmake/hip /opt/rocm/lib/cmake/hip # FindHIP.cmake ) # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to 'Release' as none was specified.") set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") # rocm-cmake contains common cmake code for rocm projects to help # setup and install include(cmake/RocmCmakeDependence.cmake) include( ROCMSetupVersion ) include( ROCMCreatePackage ) include( ROCMInstallTargets ) include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) include( ROCMHeaderWrapper ) include( ROCMCheckTargetIds ) include( ROCMClients ) # Setup GPU targets for rocm platform if(NOT (CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")) if (NOT DEFINED AMDGPU_TARGETS) set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") else() set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") endif() set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") if(GPU_TARGETS STREQUAL "all") rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" ) set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() endif() # Verify that hip-clang is used on ROCM platform if (NOT WIN32) include(cmake/VerifyCompiler.cmake) endif() # Set CXX flags set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) # Build options option(BUILD_TEST "Build tests (requires googletest)" OFF) option(DOWNLOAD_ROCPRIM "Download rocPRIM and do not search for rocPRIM package" OFF) option(DOWNLOAD_CUB "Download CUB and thrust. Do not search for CUB package" OFF) option(BUILD_BENCHMARK "Build benchmarks" OFF) option(BUILD_EXAMPLE "Build Examples" OFF) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) #Set the header wrapper ON by default. option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON) if(BUILD_ADDRESS_SANITIZER) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan") add_link_options(-fuse-ld=lld) endif() # Get dependencies include(cmake/Dependencies.cmake) # Setup VERSION set(VERSION_STRING "2.13.1") rocm_setup_version(VERSION ${VERSION_STRING}) # Print configuration summary include(cmake/Summary.cmake) print_configuration_summary() # hipCUB library add_subdirectory(hipcub) if(BUILD_TEST OR (BUILD_BENCHMARK AND NOT ONLY_INSTALL)) rocm_package_setup_component(clients) endif() # Tests if(BUILD_TEST) enable_testing() rocm_package_setup_client_component(tests) add_subdirectory(test) endif() # Examples if(BUILD_EXAMPLE) add_subdirectory(examples) endif() # Benchmarks if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) rocm_package_setup_client_component(benchmarks) add_subdirectory(benchmark) endif() # Create header wrapper for backward compatibility if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) rocm_wrap_header_dir( ${PROJECT_SOURCE_DIR}/hipcub/include/hipcub/ PATTERNS "*.h" PATTERN "*.hpp" GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS cub/${CMAKE_INSTALL_INCLUDEDIR}/hipcub/ OUTPUT_LOCATIONS cub/wrapper/include/hipcub/ ) endif() # Package if(HIP_COMPILER STREQUAL "clang") rocm_package_add_deb_dependencies(DEPENDS "rocprim-dev >= 2.10.1") rocm_package_add_rpm_dependencies(DEPENDS "rocprim-devel >= 2.10.1") set(CPACK_DEBIAN_PACKAGE_REPLACES "cub-hip") set(CPACK_RPM_PACKAGE_OBSOLETES "cub-hip") else() rocm_package_add_dependencies(DEPENDS "hip-dev >= 4.4") endif() set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") set(CPACK_RPM_PACKAGE_LICENSE "BSD") # if(NOT CPACK_PACKAGING_INSTALL_PREFIX) # set(CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}") # endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}") if(HIP_COMPILER STREQUAL "clang") rocm_create_package( NAME hipcub DESCRIPTION "hipCUB (rocPRIM backend)" MAINTAINER "hipcub-maintainer@amd.com" HEADER_ONLY ) else() rocm_create_package( NAME hipcub_nvcc DESCRIPTION "hipCUB (CUB backend)" MAINTAINER "hipcub-maintainer@amd.com" HEADER_ONLY ) endif() hipCUB-rocm-5.7.1/LICENSE.txt000066400000000000000000000032061447643347700154430ustar00rootroot00000000000000Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019-2021, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. hipCUB-rocm-5.7.1/NOTICES.txt000066400000000000000000000117621447643347700154730ustar00rootroot00000000000000Notices and Licenses file _________________________ AMD copyrighted code (MIT) Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. crascit-downloadproject v-u (MIT) Copyright (c) 2015 Crascit Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. Dependencies on nvlabs-cub v1.8 (BSD3) Copyright (c) 2010-2011, Duane Merrill. All rights reserved. Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. Modifications Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ROCmSoftwarePlatform-rocPRIM v1.0.0 (MIT) Copyright (c) 2018 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. hipCUB-rocm-5.7.1/README.md000066400000000000000000000141071447643347700151010ustar00rootroot00000000000000# hipCUB hipCUB is a thin wrapper library on top of [rocPRIM](https://github.com/ROCmSoftwarePlatform/rocPRIM) or [CUB](https://github.com/thrust/cub). It enables developers to port a project using the CUB library to the [HIP](https://github.com/ROCm-Developer-Tools/HIP) layer to run on AMD hardware. In the [ROCm](https://rocm.github.io/) environment, hipCUB uses the rocPRIM library as the backend. However, on CUDA platforms it uses CUB instead. ## Documentation Information about the library API and other user topics can be found in the [hipCUB documentation](https://hipcub.readthedocs.io/en/latest). ## Requirements * Git * CMake (3.16 or later) * For AMD GPUs: * AMD [ROCm](https://rocm.github.io/install.html) platform (1.8.0 or later) * Including [HIP-clang](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#hip-clang) compiler, which must be set as C++ compiler on ROCm platform. * [rocPRIM](https://github.com/ROCmSoftwarePlatform/rocPRIM) library * Automatically downloaded and built by CMake script. * Requires CMake 3.16.9 or later. * For NVIDIA GPUs: * CUDA Toolkit * CUB library * Automatically downloaded and built by CMake script. * Requires CMake 3.15.0 or later. * Python 3.6 or higher (HIP on Windows only, only required for install scripts) * Visual Studio 2019 with clang support (HIP on Windows only) * Strawberry Perl (HIP on Windows only) Optional: * [GTest](https://github.com/google/googletest) * Required only for tests. Building tests is enabled by default. * It will be automatically downloaded and built by CMake script. * [Google Benchmark](https://github.com/google/benchmark) * Required only for benchmarks. Building benchmarks is off by default. * It will be automatically downloaded and built by cmake script. ## Build And Install ```shell git clone https://github.com/ROCmSoftwarePlatform/hipCUB.git # Go to hipCUB directory, create and go to the build directory. cd hipCUB; mkdir build; cd build # Configure hipCUB, setup options for your system. # Build options: # BUILD_TEST - OFF by default, # BUILD_BENCHMARK - OFF by default. # DOWNLOAD_ROCPRIM - OFF by default and at ON the rocPRIM will be downloaded to build folder, # # ! IMPORTANT ! # Set C++ compiler to HIP-aware clang. You can do it by adding 'CXX=' # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # [CXX=hipcc] cmake ../. # or cmake-gui ../. # To configure hipCUB for Nvidia platforms, 'CXX=', `CXX=nvcc` or omitting the flag # entirely before 'cmake' is sufficient [CXX=nvcc] cmake -DBUILD_TEST=ON ../. # or cmake-gui ../. # or cmake -DBUILD_TEST=ON ../. # or cmake-gui ../. # or to build benchmarks cmake -DBUILD_BENCHMARK=ON ../. # Build make -j4 # Optionally, run tests if they're enabled. ctest --output-on-failure # Package make package # Install [sudo] make install ``` ### HIP on Windows Initial support for HIP on Windows has been added. To install, use the provided rmake.py python script: ```shell git clone https://github.com/ROCmSoftwarePlatform/hipCUB.git cd hipCUB # the -i option will install rocPRIM to C:\hipSDK by default python rmake.py -i # the -c option will build all clients including unit tests python rmake.py -c ``` ### Using hipCUB In A Project Recommended way of including hipCUB into a CMake project is by using its package configuration files. ```cmake # On ROCm hipCUB requires rocPRIM find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") # "/opt/rocm" - default install prefix find_package(hipcub REQUIRED CONFIG PATHS "/opt/rocm/hipcub") ... # On ROCm: includes hipCUB headers and roc::rocprim_hip target # On CUDA: includes only hipCUB headers, user has to include CUB directory target_link_libraries( hip::hipcub) ``` Include only the main header file: ```cpp #include ``` CUB or rocPRIM headers are included by hipCUB depending on the current HIP platform. ## Running Unit Tests ```shell # Go to hipCUB build directory cd hipCUB; cd build # To run all tests ctest # To run unit tests for hipCUB ./test/hipcub/ ``` ## Using custom seeds for the tests Go to the `hipCUB/test/hipcub/test_seed.hpp` file. ```cpp //(1) static constexpr int random_seeds_count = 10; //(2) static constexpr unsigned int seeds [] = {0, 2, 10, 1000}; //(3) static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); ``` (1) defines a constant that sets how many passes over the tests will be done with runtime-generated seeds. Modify at will. (2) defines the user generated seeds. Each of the elements of the array will be used as seed for all tests. Modify at will. If no static seeds are desired, the array should be left empty. ```cpp static constexpr unsigned int seeds [] = {}; ``` (3) this line should never be modified. ## Running Benchmarks ```shell # Go to hipCUB build directory cd hipCUB; cd build # To run benchmark for warp functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_warp_ [--size ] [--trials ] # To run benchmark for block functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_block_ [--size ] [--trials ] # To run benchmark for device functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_device_ [--size ] [--trials ] ``` ## Building Documentation ```shell # Go to the hipCUB docs directory cd hipCUB; cd docs # Install required pip packages python3 -m pip install -r .sphinx/requirements.txt # Build the documentation python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html # For e.g. serve the HTML docs locally cd _build/html python3 -m http.server ``` ## Support Bugs and feature requests can be reported through [the issue tracker](https://github.com/ROCmSoftwarePlatform/hipCUB/issues). ## Contributions and License Contributions of any kind are most welcome! More details are found at [CONTRIBUTING](./CONTRIBUTING.md) and [LICENSE](./LICENSE.txt). hipCUB-rocm-5.7.1/benchmark/000077500000000000000000000000001447643347700155515ustar00rootroot00000000000000hipCUB-rocm-5.7.1/benchmark/CMakeLists.txt000066400000000000000000000110201447643347700203030ustar00rootroot00000000000000# MIT License # # Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. function(add_hipcub_benchmark BENCHMARK_SOURCE) get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE) add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE}) target_include_directories(${BENCHMARK_TARGET} SYSTEM BEFORE PUBLIC "$" ) target_link_libraries(${BENCHMARK_TARGET} PRIVATE benchmark::benchmark hipcub ) if((HIP_COMPILER STREQUAL "nvcc")) set_property(TARGET ${BENCHMARK_TARGET} PROPERTY CUDA_STANDARD 14) set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE CUDA) target_compile_options(${BENCHMARK_TARGET} PRIVATE $<$:--expt-extended-lambda> ) target_link_libraries(${BENCHMARK_TARGET} PRIVATE hipcub_cub ) endif() set_target_properties(${BENCHMARK_TARGET} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark" ) rocm_install(TARGETS ${BENCHMARK_TARGET} COMPONENT benchmarks) if (WIN32 AND NOT DEFINED DLLS_COPIED) set(DLLS_COPIED "YES") set(DLLS_COPIED ${DLLS_COPIED} PARENT_SCOPE) # for now adding in all .dll as dependency chain is not cmake based on win32 file( GLOB third_party_dlls LIST_DIRECTORIES ON CONFIGURE_DEPENDS ${HIP_DIR}/bin/*.dll ${CMAKE_SOURCE_DIR}/rtest.* ) foreach( file_i ${third_party_dlls}) add_custom_command( TARGET ${BENCHMARK_TARGET} POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy_if_different ${file_i} ${PROJECT_BINARY_DIR}/benchmark ) endforeach( file_i ) endif() endfunction() # **************************************************************************** # Benchmarks # **************************************************************************** add_hipcub_benchmark(benchmark_block_adjacent_difference.cpp) add_hipcub_benchmark(benchmark_block_discontinuity.cpp) add_hipcub_benchmark(benchmark_block_exchange.cpp) add_hipcub_benchmark(benchmark_block_histogram.cpp) add_hipcub_benchmark(benchmark_block_merge_sort.cpp) add_hipcub_benchmark(benchmark_block_radix_sort.cpp) add_hipcub_benchmark(benchmark_block_radix_rank.cpp) add_hipcub_benchmark(benchmark_block_reduce.cpp) add_hipcub_benchmark(benchmark_block_run_length_decode.cpp) add_hipcub_benchmark(benchmark_block_scan.cpp) add_hipcub_benchmark(benchmark_block_shuffle.cpp) add_hipcub_benchmark(benchmark_device_adjacent_difference.cpp) add_hipcub_benchmark(benchmark_device_histogram.cpp) add_hipcub_benchmark(benchmark_device_memory.cpp) add_hipcub_benchmark(benchmark_device_merge_sort.cpp) add_hipcub_benchmark(benchmark_device_partition.cpp) add_hipcub_benchmark(benchmark_device_radix_sort.cpp) add_hipcub_benchmark(benchmark_device_reduce_by_key.cpp) add_hipcub_benchmark(benchmark_device_reduce.cpp) add_hipcub_benchmark(benchmark_device_run_length_encode.cpp) add_hipcub_benchmark(benchmark_device_scan.cpp) add_hipcub_benchmark(benchmark_device_segmented_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_radix_sort.cpp) add_hipcub_benchmark(benchmark_device_segmented_reduce.cpp) add_hipcub_benchmark(benchmark_device_select.cpp) add_hipcub_benchmark(benchmark_device_spmv.cpp) add_hipcub_benchmark(benchmark_warp_exchange.cpp) add_hipcub_benchmark(benchmark_warp_load.cpp) add_hipcub_benchmark(benchmark_warp_reduce.cpp) add_hipcub_benchmark(benchmark_warp_scan.cpp) add_hipcub_benchmark(benchmark_warp_store.cpp) add_hipcub_benchmark(benchmark_warp_merge_sort.cpp) hipCUB-rocm-5.7.1/benchmark/benchmark_block_adjacent_difference.cpp000066400000000000000000000353141447643347700253720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_adjacent_difference.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template < class Benchmark, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, typename... Args > __global__ __launch_bounds__(BlockSize) void kernel(Args ...args) { Benchmark::template run(args...); } template struct minus { HIPCUB_HOST_DEVICE inline constexpr T operator()(const T& a, const T& b) const { return a - b; } }; struct subtract_left { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractLeft(input, output, minus{}, T(123)); } else { adjacent_difference.SubtractLeft(input, output, minus{}); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_left_partial_tile { template __device__ static void run(const T* d_input, int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size, T(123)); } else { adjacent_difference.SubtractLeftPartialTile(input, output, minus{}, tile_size); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_right { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_difference.SubtractRight(input, output, minus{}, T(123)); } else { adjacent_difference.SubtractRight(input, output, minus{}); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct subtract_right_partial_tile { template __device__ static void run(const T* d_input, int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::BlockAdjacentDifference adjacent_difference; int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; #pragma nounroll for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; adjacent_difference.SubtractRightPartialTile(input, output, minus{}, tile_size); for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value && !std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice ) ); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, Trials ); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value || std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); const std::vector tile_sizes = benchmark_utils::get_random_data(num_blocks, 0, items_per_block); T* d_input; int* d_tile_sizes; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_tile_sizes, tile_sizes.data(), tile_sizes.size() * sizeof(tile_sizes[0]), hipMemcpyHostToDevice ) ); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_tile_sizes, d_output, Trials ); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_tile_sizes)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ (std::string("block_adjacent_difference<" #T ", " #BS ">.") + name + ("<" #IPT ", " #WITH_TILE ">")).c_str(), \ &run_benchmark, \ stream, size \ ) #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile), \ CREATE_BENCHMARK(type, block, 3, with_tile), \ CREATE_BENCHMARK(type, block, 4, with_tile), \ CREATE_BENCHMARK(type, block, 8, with_tile), \ CREATE_BENCHMARK(type, block, 16, with_tile), \ CREATE_BENCHMARK(type, block, 32, with_tile) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(double, 256, false) }; if(!std::is_same::value) { bs.insert(bs.end(), { BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(long long, 256, true), BENCHMARK_TYPE(double, 256, true) }); } benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("SubtractLeft", benchmarks, stream, size); add_benchmarks("SubtractRight", benchmarks, stream, size); add_benchmarks("SubtractLeftPartialTile", benchmarks, stream, size); add_benchmarks("SubtractRightPartialTile", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; }hipCUB-rocm-5.7.1/benchmark/benchmark_block_discontinuity.cpp000066400000000000000000000240651447643347700243550ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_discontinuity.hpp" #include "hipcub/thread/thread_operators.hpp" //to use hipcub::Equality #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template struct custom_flag_op1 { HIPCUB_HOST_DEVICE bool operator()(const T& a, const T& b) { return (a == b); } }; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, T * d_output) { Runner::template run(d_input, d_output); } struct flag_heads { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality(), T(123)); } else { bdiscontinuity.FlagHeads(head_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct flag_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality(), T(123)); } else { bdiscontinuity.FlagTails(tail_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += tail_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct flag_heads_and_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockDiscontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.FlagHeadsAndTails(head_flags, T(123), tail_flags, T(234), input, hipcub::Equality()); } else { bdiscontinuity.FlagHeadsAndTails(head_flags, tail_flags, input, hipcub::Equality()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; input[i] += tail_flags[i]; } __syncthreads(); } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ (std::string("block_discontinuity.SubAlgorithm Name:") + name + ("")).c_str(), \ &run_benchmark, \ stream, size \ ) #define BENCHMARK_TYPE(type, block, bool) \ CREATE_BENCHMARK(type, block, 1, bool), \ CREATE_BENCHMARK(type, block, 2, bool), \ CREATE_BENCHMARK(type, block, 3, bool), \ CREATE_BENCHMARK(type, block, 4, bool), \ CREATE_BENCHMARK(type, block, 8, bool) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(uint8_t, 256, false), BENCHMARK_TYPE(uint8_t, 256, true), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(long long, 256, true), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_discontinuity" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("flag_heads", benchmarks, stream, size); add_benchmarks("flag_tails", benchmarks, stream, size); add_benchmarks("flag_heads_and_tails", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_exchange.cpp000066400000000000000000000320051447643347700232230ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_exchange.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) { Runner::template run(d_input, d_ranks, d_output); } struct blocked_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToStriped(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; struct striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.StripedToBlocked(input, input); __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct blocked_to_warp_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.BlockedToWarpStriped(input, input); __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectWarpStriped(lid, d_output + block_offset, input); } }; struct warp_striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; hipcub::LoadDirectWarpStriped(lid, d_input + block_offset, input); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.WarpStripedToBlocked(input, input); __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct scatter_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToBlocked(input, input, ranks); __syncthreads();// extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectBlocked(lid, d_output + block_offset, input); } }; struct scatter_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; hipcub::LoadDirectStriped(lid, d_input + block_offset, input); hipcub::LoadDirectStriped(lid, d_ranks + block_offset, ranks); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockExchange exchange; exchange.ScatterToStriped(input, input, ranks); __syncthreads(); // extra sync needed because of loop. In normal usage sync with be cared for by the load and store functions (outside the loop). } hipcub::StoreDirectStriped(lid, d_output + block_offset, input); } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input(size); // Fill input for(size_t i = 0; i < size; i++) { input[i] = T(i); } std::vector ranks(size); // Fill ranks (for scatter operations) std::mt19937 gen; for(size_t bi = 0; bi < size / items_per_block; bi++) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } T * d_input; unsigned int * d_ranks; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_ranks, size * sizeof(unsigned int))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_ranks, ranks.data(), size * sizeof(unsigned int), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_ranks, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_ranks)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_exchange.SubAlgorithm Name:") + name).c_str(), \ &run_benchmark, \ stream, size \ ) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 7), \ CREATE_BENCHMARK(type, block, 8) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(custom_float2, 256), BENCHMARK_TYPE(custom_double2, 256), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_exchange" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("blocked_to_striped", benchmarks, stream, size); add_benchmarks("striped_to_blocked", benchmarks, stream, size); add_benchmarks("blocked_to_warp_striped", benchmarks, stream, size); add_benchmarks("warp_striped_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_striped", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_histogram.cpp000066400000000000000000000170671447643347700234510ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_histogram.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct histogram { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int index = ((hipBlockIdx_x * BlockSize) + hipThreadIdx_x) * ItemsPerThread; unsigned int global_offset = hipBlockIdx_x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[index + k]; } using bhistogram_t = hipcub::BlockHistogram; __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t(storage).Histogram(values, histogram); } #pragma unroll for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + hipThreadIdx_x < BinSize) { output[global_offset + hipThreadIdx_x] = histogram[offset + hipThreadIdx_x]; global_offset += BlockSize; } } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize = BlockSize, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); const auto bin_size = BinSize * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, bin_size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_histogram.Method Name:") + method_name).c_str(), \ &run_benchmark, \ stream, size \ ) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { std::vector new_benchmarks = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(unsigned long long, 256), BENCHMARK_TYPE(unsigned long long, 320) }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_histogram" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // using_atomic using histogram_a_t = histogram; add_benchmarks( benchmarks, "histogram", "using_atomic", stream, size ); // using_sort using histogram_s_t = histogram; add_benchmarks( benchmarks, "histogram", "using_sort", stream, size ); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_merge_sort.cpp000066400000000000000000000206231447643347700236120ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API #include "hipcub/block/block_merge_sort.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, class CompareOp, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; sort.Sort(keys, compare_op); } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, class CompareOp, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output, CompareOp compare_op) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; hipcub::LoadDirectStriped(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { hipcub::BlockMergeSort sort; sort.Sort(keys, values, compare_op); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, class CompareOp = test_utils::less, unsigned int Trials = 10 > void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input; if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); } else { input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp() ); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp() ); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_merge_sort.SubAlgorithm Name:") + name).c_str(), \ &run_benchmark, \ benchmark_kind, stream, size \ ) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8) void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 512), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512) }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_merge_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_radix_rank.cpp000066400000000000000000000216001447643347700235620ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_radix_rank.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/block/radix_rank_sort_operations.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class RadixRankAlgorithm { RADIX_RANK_BASIC, RADIX_RANK_MEMOIZE, RADIX_RANK_MATCH, }; template __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, int* ranks_output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; hipcub::LoadDirectBlocked(lid, keys_input + block_offset, keys); using KeyTraits = hipcub::Traits; using UnsignedBits = typename KeyTraits::UnsignedBits; using DigitExtractor = hipcub::BFEDigitExtractor; UnsignedBits(&unsigned_keys)[ItemsPerThread] = reinterpret_cast(keys); using RankType = std::conditional_t< BenchmarkKind == RadixRankAlgorithm::RADIX_RANK_MATCH, hipcub::BlockRadixRankMatch, hipcub::BlockRadixRank>; #pragma unroll for(unsigned int key = 0; key < ItemsPerThread; key++) { unsigned_keys[key] = KeyTraits::TwiddleIn(unsigned_keys[key]); } int ranks[ItemsPerThread]; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { __shared__ typename RankType::TempStorage storage; RankType rank(storage); unsigned begin_bit = 0; const unsigned end_bit = sizeof(T) * 8; while(begin_bit < end_bit) { const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); DigitExtractor digit_extractor(begin_bit, pass_bits); rank.RankKeys(unsigned_keys, ranks, digit_extractor); begin_bit += RadixBits; } } hipcub::StoreDirectBlocked(lid, ranks_output + block_offset, ranks); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input; if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)); } else { input = benchmark_utils::get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } T* d_input; int* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(int))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME( rank_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, KIND, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_radix_rank<" #T ", " #KIND ", " #BS ", " #IPT ">.") + name).c_str(), \ &run_benchmark, \ stream, \ size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_BASIC, block, ipt), \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_MEMOIZE, block, ipt), \ CREATE_BENCHMARK(type, RadixRankAlgorithm::RADIX_RANK_MATCH, block, ipt) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK_KINDS(type, block, 1), \ CREATE_BENCHMARK_KINDS(type, block, 4), \ CREATE_BENCHMARK_KINDS(type, block, 8), \ CREATE_BENCHMARK_KINDS(type, block, 16), \ CREATE_BENCHMARK_KINDS(type, block, 32) // clang-format on void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("rank", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_radix_sort.cpp000066400000000000000000000262611447643347700236260ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_radix_sort.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; struct helper_blocked_blocked { template HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) { hipcub::LoadDirectStriped(linear_id, block_iter, items); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.Sort(keys); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.Sort(keys, values); } }; struct helper_blocked_striped { template HIPCUB_DEVICE static void load(int linear_id, InputIteratorT block_iter, T (&items)[ItemsPerThread]) { hipcub::LoadDirectBlocked(linear_id, block_iter, items); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys); } template HIPCUB_DEVICE static void sort(T (&keys)[ItemsPerThread], T (&values)[ItemsPerThread]) { hipcub::BlockRadixSort sort; sort.SortBlockedToStriped(keys, values); } }; template __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys); } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) { const unsigned int lid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; Helper::template load(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { Helper::template sort(keys, values); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } hipcub::StoreDirectStriped(lid, output + block_offset, keys); } template void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input; if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, (T)-1000, (T)+1000); } else { input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark((std::string("block_radix_sort.SubAlgorithm Name:") \ + name) \ .c_str(), \ &run_benchmark, \ benchmark_kind, \ stream, \ size) // clang-format off #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8) // clang-format on template void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // clang-format off add_benchmarks( benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_keys, "sort_to_striped(keys)", benchmarks, stream, size); add_benchmarks( benchmark_kinds::sort_pairs, "sort_to_striped(keys, values)", benchmarks, stream, size); // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_reduce.cpp000066400000000000000000000172111447643347700227120ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/block/block_reduce.hpp" #include "hipcub/thread/thread_operators.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct reduce { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; T reduced_value; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using breduce_t = hipcub::BlockReduce; __shared__ typename breduce_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { reduced_value = breduce_t(storage).Reduce(values, hipcub::Sum()); values[0] = reduced_value; } if(hipThreadIdx_x == 0) { output[hipBlockIdx_x] = reduced_value; } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_reduce.Method Name:") + method_name).c_str(), \ &run_benchmark, \ stream, size \ ) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; add_benchmarks( benchmarks, "reduce", "BLOCK_REDUCE_WARP_REDUCTIONS", stream, size ); // raking reduce using reduce_rr_t = reduce; add_benchmarks( benchmarks, "reduce", "BLOCK_REDUCE_RAKING", stream, size ); // raking reduce commutative only using reduce_rrco_t = reduce; add_benchmarks( benchmarks, "reduce", "BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY", stream, size ); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_run_length_decode.cpp000066400000000000000000000210371447643347700251140ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_run_length_decode.hpp" #include "hipcub/block/block_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class ItemT, class OffsetT, unsigned BlockSize, unsigned RunsPerThread, unsigned DecodedItemsPerThread, unsigned Trials > __global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel( const ItemT * d_run_items, const OffsetT * d_run_offsets, ItemT * d_decoded_items, bool enable_store = false) { using BlockRunLengthDecodeT = hipcub::BlockRunLengthDecode< ItemT, BlockSize, RunsPerThread, DecodedItemsPerThread >; ItemT run_items[RunsPerThread]; OffsetT run_offsets[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; hipcub::LoadDirectBlocked(global_thread_idx, d_run_items, run_items); hipcub::LoadDirectBlocked(global_thread_idx, d_run_offsets, run_offsets); BlockRunLengthDecodeT block_run_length_decode( run_items, run_offsets ); const OffsetT total_decoded_size = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; #pragma nounroll for (unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; while (decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.RunLengthDecode(decoded_items, decoded_window_offset); if (enable_store) { hipcub::StoreDirectBlocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); } decoded_window_offset += BlockSize * DecodedItemsPerThread; } } } template< class ItemT, class OffsetT, unsigned MinRunLength, unsigned MaxRunLength, unsigned BlockSize, unsigned RunsPerThread, unsigned DecodedItemsPerThread, unsigned Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto runs_per_block = BlockSize * RunsPerThread; const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1)/runs_per_block); std::vector run_items(num_runs); std::vector run_offsets(num_runs + 1); std::default_random_engine prng(std::random_device{}()); using ItemDistribution = std::conditional_t< std::is_integral::value, std::uniform_int_distribution, std::uniform_real_distribution >; ItemDistribution run_item_dist(0, 100); std::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); for (size_t i = 0; i < num_runs; ++i) { run_items[i] = run_item_dist(prng); } for (size_t i = 1; i < num_runs + 1; ++i) { const OffsetT next_run_length = run_length_dist(prng); run_offsets[i] = run_offsets[i - 1] + next_run_length; } const OffsetT output_length = run_offsets.back(); ItemT * d_run_items{}; HIP_CHECK(hipMalloc(&d_run_items, run_items.size() * sizeof(ItemT))); HIP_CHECK( hipMemcpy( d_run_items, run_items.data(), run_items.size() * sizeof(ItemT), hipMemcpyHostToDevice ) ); OffsetT * d_run_offsets{}; HIP_CHECK(hipMalloc(&d_run_offsets, run_offsets.size() * sizeof(OffsetT))); HIP_CHECK( hipMemcpy( d_run_offsets, run_offsets.data(), run_offsets.size() * sizeof(OffsetT), hipMemcpyHostToDevice ) ); ItemT * d_output{}; HIP_CHECK(hipMalloc(&d_output, output_length * sizeof(ItemT))); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL( HIP_KERNEL_NAME( block_run_length_decode_kernel< ItemT, OffsetT, BlockSize, RunsPerThread, DecodedItemsPerThread, Trials > ), dim3(num_runs/runs_per_block), dim3(BlockSize), 0, stream, d_run_items, d_run_offsets, d_output ); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * output_length * sizeof(ItemT) * Trials); state.SetItemsProcessed(state.iterations() * output_length * Trials); HIP_CHECK(hipFree(d_run_items)); HIP_CHECK(hipFree(d_run_offsets)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ benchmark::RegisterBenchmark( \ "block_run_length_decode", \ &run_benchmark, \ stream, size \ ) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_run_length_decode" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks { CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 500, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4), CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_scan.cpp000066400000000000000000000227451447643347700223770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // hipCUB API #include "hipcub/block/block_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) { Runner::template run(input, output, init); } template struct inclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).InclusiveScan(values, values, hipcub::Sum()); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template struct exclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = hipcub::BlockScan; __shared__ typename bscan_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t(storage).ExclusiveScan(values, values, init, hipcub::Sum()); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, input[0]); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_scan.Method Name:") + method_name).c_str(), \ &run_benchmark, \ stream, size \ ) // clang-format off #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) // clang-format on template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(uint8_t, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // clang-format off add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "inclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_RAKING_MEMOIZE", stream, size); add_benchmarks>( benchmarks, "exclusive_scan", "BLOCK_SCAN_WARP_SCANS", stream, size); // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_block_shuffle.cpp000066400000000000000000000277421447643347700231110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "hipcub/block/block_shuffle.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } struct offset { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T value = input[tid]; using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Offset(value, value, 1); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } output[tid] = value; } static constexpr bool uses_ipt = false; }; struct rotate { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T value = input[tid]; using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Rotate(value, value, 1); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } output[tid] = value; } static constexpr bool uses_ipt = false; }; struct up { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = input[ItemsPerThread * tid + i]; } using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Up(values, values); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } for(unsigned int i = 0; i < ItemsPerThread; i++) { output[ItemsPerThread * tid + i] = values[i]; } } static constexpr bool uses_ipt = true; }; struct down { template __device__ static void run(const T* input, T* output) { const unsigned int tid = hipBlockIdx_x * BlockSize + hipThreadIdx_x; T values[ItemsPerThread]; for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = input[ItemsPerThread * tid + i]; } using bshuffle_t = hipcub::BlockShuffle; __shared__ typename bshuffle_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { bshuffle_t(storage).Down(values, values); // sync is required because of loop since // temporary storage is accessed next iteration __syncthreads(); } for(unsigned int i = 0; i < ItemsPerThread; i++) { output[ItemsPerThread * tid + i] = values[i]; } } static constexpr bool uses_ipt = true; }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size, T(1)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IPT(BS, IPT) \ benchmark::RegisterBenchmark( \ (std::string("block_shuffle.SubAlgorithm Name:") \ + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK(BS) \ benchmark::RegisterBenchmark((std::string("block_shuffle.SubAlgorithm Name:") + name) \ .c_str(), \ &run_benchmark, \ stream, \ size) template = true> void add_benchmarks_type(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size, const std::string& type_name) { std::vector bs = { CREATE_BENCHMARK_IPT(256, 1), CREATE_BENCHMARK_IPT(256, 3), CREATE_BENCHMARK_IPT(256, 4), CREATE_BENCHMARK_IPT(256, 8), CREATE_BENCHMARK_IPT(256, 16), CREATE_BENCHMARK_IPT(256, 32), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } template = true> void add_benchmarks_type(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size, const std::string& type_name) { std::vector bs = { CREATE_BENCHMARK(256), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_BENCHMARKS(T) add_benchmarks_type(name, benchmarks, stream, size, #T) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; CREATE_BENCHMARKS(int); CREATE_BENCHMARKS(float); CREATE_BENCHMARKS(double); CREATE_BENCHMARKS(int8_t); CREATE_BENCHMARKS(long long); CREATE_BENCHMARKS(custom_float2); CREATE_BENCHMARKS(custom_double2); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_block_shuffle" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("offset", benchmarks, stream, size); add_benchmarks("rotate", benchmarks, stream, size); add_benchmarks("up", benchmarks, stream, size); add_benchmarks("down", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_adjacent_difference.cpp000066400000000000000000000217471447643347700255440ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #include "common_benchmark_header.hpp" #include #include "cmdparser.hpp" #include #include #include #include #include #include #include namespace { #ifndef DEFAULT_N constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; #endif constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractLeftCopy( temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractRightCopy( temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractLeft( temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*copy*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) { return ::hipcub::DeviceAdjacentDifference::SubtractRight( temporary_storage, storage_size, input, std::forward(args)...); } template void run_benchmark(benchmark::State& state, const std::size_t size, const hipStream_t stream) { using output_type = T; static constexpr bool debug_synchronous = false; // Generate data const std::vector input = benchmark_utils::get_random_data(size, 1, 100); T* d_input; output_type* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK( hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); if(copy) { HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); } static constexpr std::integral_constant left_tag; static constexpr std::integral_constant copy_tag; // Allocate temporary storage std::size_t temp_storage_size{}; void* d_temp_storage = nullptr; const auto launch = [&] { return dispatch_adjacent_difference(left_tag, copy_tag, d_temp_storage, temp_storage_size, d_input, d_output, size, hipcub::Sum{}, stream, debug_synchronous); }; HIP_CHECK(launch()); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipDeviceSynchronize()); // Run for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); if(copy) { hipFree(d_output); } hipFree(d_temp_storage); } } // namespace using namespace std::string_literals; #define CREATE_BENCHMARK(T, left, copy) \ benchmark::RegisterBenchmark(("Subtract" + (left ? "Left"s : "Right"s) \ + (copy ? "Copy"s : ""s) + "<" #T ">") \ .c_str(), \ &run_benchmark, \ size, \ stream) // clang-format off #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, true, false), \ CREATE_BENCHMARK(T, true, true), \ CREATE_BENCHMARK(T, false, false), \ CREATE_BENCHMARK(T, false, true) // clang-format on int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP const hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks const std::vector benchmarks = { CREATE_BENCHMARKS(int), CREATE_BENCHMARKS(std::int64_t), CREATE_BENCHMARKS(uint8_t), CREATE_BENCHMARKS(float), CREATE_BENCHMARKS(double), CREATE_BENCHMARKS(custom_float2), CREATE_BENCHMARKS(custom_double2), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_histogram.cpp000066400000000000000000000560071447643347700236130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_histogram.hpp" #include "hipcub/iterator/transform_input_iterator.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template std::vector generate(size_t size, int entropy_reduction, long long lower_level, long long upper_level) { if(entropy_reduction >= 5) { return std::vector(size, (lower_level + upper_level) / 2); } const size_t max_random_size = 1024 * 1024; std::random_device rd; std::default_random_engine gen(rd()); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { // Reduce entropy by applying bitwise AND to random bits // "An Improved Supercomputer Sorting Benchmark", 1992 // Kurt Thearling & Stephen Smith auto v = gen(); for(int e = 0; e < entropy_reduction; e++) { v &= gen(); } return T(lower_level + v % (upper_level - lower_level)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } int get_entropy_percents(int entropy_reduction) { switch(entropy_reduction) { case 0: return 100; case 1: return 81; case 2: return 54; case 3: return 33; case 4: return 20; default: return 0; } } const int entropy_reductions[] = { 0, 2, 4, 6 }; template void run_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; const T lower_level = 0; // casting for compilation with CUB backend because // there is no casting from size_t (aka unsigned long) to __half const T upper_level = static_cast(bins * scale); // Generate data std::vector input = generate(size, entropy_reduction, lower_level, upper_level); T * d_input; counter_type * d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceHistogram::HistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( hipcub::DeviceHistogram::HistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceHistogram::HistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, lower_level, upper_level, int(size), stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; int num_levels[ActiveChannels]; int lower_level[ActiveChannels]; int upper_level[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { lower_level[channel] = 0; upper_level[channel] = bins * scale; num_levels[channel] = bins + 1; } // Generate data std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); T * d_input; counter_type * d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); } HIP_CHECK( hipMemcpy( d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream, false ) )); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream, false ) )); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramEven( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, lower_level, upper_level, int(size), stream, false ) )); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_histogram[channel])); } } template void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; // Generate data std::vector input = benchmark_utils::get_random_data(size, 0, bins); std::vector levels(bins + 1); std::iota(levels.begin(), levels.end(), static_cast(0)); T * d_input; T * d_levels; counter_type * d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_levels, levels.data(), (bins + 1) * sizeof(T), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceHistogram::HistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( hipcub::DeviceHistogram::HistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceHistogram::HistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, bins + 1, d_levels, int(size), stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_levels)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; // Number of levels for a single channel const int num_levels_channel = bins + 1; int num_levels[ActiveChannels]; std::vector levels[ActiveChannels]; for (unsigned int channel = 0; channel < ActiveChannels; channel++) { levels[channel].resize(num_levels_channel); std::iota(levels[channel].begin(), levels[channel].end(), static_cast(0)); num_levels[channel] = num_levels_channel; } // Generate data std::vector input = benchmark_utils::get_random_data(size * Channels, 0, bins); T * d_input; T * d_levels[ActiveChannels]; counter_type * d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); } HIP_CHECK( hipMemcpy( d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice ) ); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK( hipMemcpy( d_levels[channel], levels[channel].data(), num_levels_channel * sizeof(T), hipMemcpyHostToDevice ) ); } void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream, false ) )); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream, false ) )); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(( hipcub::DeviceHistogram::MultiHistogramRange( d_temporary_storage, temporary_storage_bytes, d_input, d_histogram, num_levels, d_levels, int(size), stream, false ) )); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_levels[channel])); HIP_CHECK(hipFree(d_histogram[channel])); } } template struct num_limits { static constexpr T max() { return std::numeric_limits::max(); }; }; template<> struct num_limits<__half> { static constexpr double max() { return 65504.0; }; }; #define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ if(num_limits::max() > BINS * SCALE) \ { \ VECTOR.push_back(benchmark::RegisterBenchmark( \ (std::string("histogram_even") + "" + "(Entropy Percent:" \ + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" \ + std::to_string(BINS) + " bins)") \ .c_str(), \ [=](benchmark::State& state) \ { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); \ } #define BENCHMARK_TYPE(VECTOR, T) \ CREATE_EVEN_BENCHMARK(VECTOR, T, 10, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 100, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, 1234); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 16, 10); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 256, 10); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 65536, 1) void add_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : entropy_reductions) { BENCHMARK_TYPE(benchmarks, long long); BENCHMARK_TYPE(benchmarks, int); BENCHMARK_TYPE(benchmarks, unsigned short); BENCHMARK_TYPE(benchmarks, uint8_t); BENCHMARK_TYPE(benchmarks, double); BENCHMARK_TYPE(benchmarks, float); //this limitation can be removed once https://github.com/NVIDIA/cub/issues/484 is fixed #ifdef __HIP_PLATFORM_AMD__ BENCHMARK_TYPE(benchmarks, __half); #endif }; } #define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ benchmark::RegisterBenchmark( \ (std::string("multi_histogram_even") + "" + \ "(Entropy Percent:" + std::to_string(get_entropy_percents(entropy_reduction)) + "%,Bin Count:" + \ std::to_string(BINS) + " bins)" \ ).c_str(), \ [=](benchmark::State& state) { \ run_multi_even_benchmark( \ state, BINS, SCALE, entropy_reduction, stream, size \ ); \ } \ ) void add_multi_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : entropy_reductions) { std::vector bs = { CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 10, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, int, 100, 1234), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 16, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 256, 10), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned short, 65536, 1), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); }; } #define CREATE_RANGE_BENCHMARK(T, BINS) \ benchmark::RegisterBenchmark( \ (std::string("histogram_range") + "" + \ "(Bin Count:" + std::to_string(BINS) + " bins)" \ ).c_str(), \ [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); } \ ) #define BENCHMARK_RANGE_TYPE(T) \ CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000), \ CREATE_RANGE_BENCHMARK(T, 100000), CREATE_RANGE_BENCHMARK(T, 1000000) void add_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = {BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(double)}; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ benchmark::RegisterBenchmark( \ (std::string("multi_histogram_range") + "<" #CHANNELS ", " #ACTIVE_CHANNELS ", " #T ">" + \ "(" + std::to_string(BINS) + " bins)" \ ).c_str(), \ [=](benchmark::State& state) { \ run_multi_range_benchmark( \ state, BINS, stream, size \ ); \ } \ ) void add_multi_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 10000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 100000), CREATE_MULTI_RANGE_BENCHMARK(4, 3, float, 1000000), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_histogram" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_even_benchmarks(benchmarks, stream, size); add_multi_even_benchmarks(benchmarks, stream, size); add_range_benchmarks(benchmarks, stream, size); add_multi_range_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_memory.cpp000066400000000000000000000421221447643347700231170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_scan.hpp" #include "hipcub/block/block_store.hpp" enum memory_operation_method { direct, striped, vectorize, transpose, warp_transpose }; enum kernel_operation { no_operation, block_scan, custom_operation, atomics_no_collision, atomics_inter_block_collision, atomics_inter_warp_collision, }; struct empty_storage_type {}; template struct operation; // no operation template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&)[ItemsPerThread], T* = nullptr) {} }; // custom operation template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)global_mem_output; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; constexpr unsigned int repeats = 30; #pragma unroll for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); } } } }; // block scan template struct operation { typedef typename hipcub::BlockScan block_scan_type; typedef typename block_scan_type::TempStorage storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)global_mem_output; // sync before re-using shared memory from load __syncthreads(); block_scan_type(storage).InclusiveScan(input, input, hipcub::Sum()); } }; // atomics_no_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { typedef empty_storage_type storage_type; HIPCUB_DEVICE inline void operator()(storage_type& storage, T (&input)[ItemsPerThread], T* global_mem_output = nullptr) { (void)storage; (void)input; const unsigned int index = threadIdx.x * ItemsPerThread; #pragma unroll for(unsigned int i = 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; template struct memory_operation {}; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE; }; template<> struct memory_operation { static constexpr hipcub::BlockLoadAlgorithm load_type = hipcub::BlockLoadAlgorithm::BLOCK_LOAD_WARP_TRANSPOSE; static constexpr hipcub::BlockStoreAlgorithm store_type = hipcub::BlockStoreAlgorithm::BLOCK_STORE_WARP_TRANSPOSE; }; template __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { typedef memory_operation mem_op; typedef hipcub::BlockLoad load_type; typedef hipcub::BlockStore store_type; __shared__ union { typename load_type::TempStorage load; typename store_type::TempStorage store; typename CustomOp::storage_type operand; } storage; constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load_type(storage.load).Load(input + offset, items); op(storage.operand, items, output); // sync before re-using shared memory from load or from operand __syncthreads(); store_type(storage.store).Store(output + offset, items); } template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) { const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input; if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); } else { input = benchmark_utils::get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; // Warm-up for(size_t i = 0; i < 10; i++) { hipLaunchKernelGGL(HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) { std::vector input; if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, (T)-1000, (T) + 1000); } else { input = benchmark_utils::get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IPT(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ { \ benchmarks.push_back(benchmark::RegisterBenchmark( \ #METHOD "_" #OPERATION "<" #T "," #SIZE ",BS:" #BLOCK_SIZE ",IPT:" #IPT ">", \ [=](benchmark::State& state) \ { run_benchmark(state, SIZE, stream); })); \ } #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ { \ benchmarks.push_back(benchmark::RegisterBenchmark( \ "Memcpy<" #T "," #SIZE ">", \ [=](benchmark::State& state) { run_benchmark_memcpy(state, SIZE, stream); })); \ } // clang-format off #define CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 1) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 2) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 4) \ CREATE_BENCHMARK_IPT(MEM_OP, OP, TYPE, SIZE, BLOCK_SIZE, 8) #define CREATE_BENCHMARK_MEM_OP(MEM_OP, OP, TYPE, SIZE) \ CREATE_BENCHMARK_BLOCK_SIZE(MEM_OP, OP, TYPE, SIZE, 256) #define CREATE_BENCHMARK(OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(direct, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(striped, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(vectorize, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(transpose, OP, TYPE, SIZE) \ CREATE_BENCHMARK_MEM_OP(warp_transpose, OP, TYPE, SIZE) // clang-format on template constexpr unsigned int megabytes(unsigned int size) { return (size * (1024 * 1024 / sizeof(T))); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const int trials = parser.get("trials"); std::cout << "benchmark_device_memory" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; // Simple memory copy from device to device, not running a kernel CREATE_BENCHMARK_MEMCPY(int, megabytes(128)) // clang-format off CREATE_BENCHMARK(no_operation, int, megabytes(128)) CREATE_BENCHMARK(block_scan, int, megabytes(128)) CREATE_BENCHMARK(custom_operation, int, megabytes(128)) CREATE_BENCHMARK(atomics_no_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_block_collision, int, megabytes(128)) CREATE_BENCHMARK(atomics_inter_warp_collision, int, megabytes(128)) // clang-format on // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_merge_sort.cpp000066400000000000000000000260251447643347700237610ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/hipcub.hpp" #include "hipcub/device/device_merge_sort.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 32 << 20; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template std::vector generate_keys(size_t size) { using key_type = Key; if(std::is_floating_point::value) { return benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000), size); } else { return benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max(), size ); } } template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; auto keys_input = generate_keys(size); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceMergeSort::SortKeysCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( hipcub::DeviceMergeSort::SortKeysCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceMergeSort::SortKeysCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, compare_function, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; using value_type = Value; auto compare_function = [] __device__ (const key_type & a, const key_type & b) { return a < b; }; auto keys_input = generate_keys(size); std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; value_type * d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceMergeSort::SortPairsCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, compare_function, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( hipcub::DeviceMergeSort::SortPairsCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, compare_function, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceMergeSort::SortPairsCopy( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, size, compare_function, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed( state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) ); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key) \ { \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">").c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size); } \ ) \ ); \ } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ { \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value">").c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size); } \ ) \ ); \ } void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { CREATE_SORT_KEYS_BENCHMARK(int) CREATE_SORT_KEYS_BENCHMARK(long long) CREATE_SORT_KEYS_BENCHMARK(int8_t) CREATE_SORT_KEYS_BENCHMARK(uint8_t) CREATE_SORT_KEYS_BENCHMARK(short) } void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; using custom_char_double = benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; CREATE_SORT_PAIRS_BENCHMARK(int, float) CREATE_SORT_PAIRS_BENCHMARK(int, double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, float) CREATE_SORT_PAIRS_BENCHMARK(long long, double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_partition.cpp000066400000000000000000000371321447643347700236250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_partition.hpp" #include #include #ifndef DEFAULT_N constexpr size_t DEFAULT_N = 1024 * 1024 * 32; #endif constexpr unsigned int batch_size = 10; constexpr unsigned int warmup_size = 5; namespace { template struct LessOp { HIPCUB_HOST_DEVICE LessOp(const T& pivot) : pivot_{pivot} { } HIPCUB_HOST_DEVICE bool operator()(const T& val) const { return val < pivot_; } private: T pivot_; }; } template void run_flagged(benchmark::State& state, const hipStream_t stream, const T threshold, const size_t size) { const auto select_op = LessOp{threshold}; const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); std::vector flags(size); for(unsigned int i = 0; i < size; i++) { flags[i] = static_cast(select_op(input[i])); } T* d_input = nullptr; F* d_flags = nullptr; T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, input.size() * sizeof(F))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; HIP_CHECK( hipcub::DevicePartition::Flagged( nullptr, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream ) ); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_flags, flags.data(), flags.size() * sizeof(F), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK( hipcub::DevicePartition::Flagged( d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for (unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK( hipcub::DevicePartition::Flagged( d_temp_storage, temp_storage_bytes, d_input, d_flags, d_output, d_num_selected_output, static_cast(input.size()), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_num_selected_output)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_flags)); HIP_CHECK(hipFree(d_input)); } template void run_predicate(benchmark::State& state, const hipStream_t stream, const T threshold, const size_t size) { const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); T* d_input = nullptr; T* d_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, sizeof(unsigned int))); const auto select_op = LessOp{threshold}; // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; HIP_CHECK( hipcub::DevicePartition::If( nullptr, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream ) ); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK( hipcub::DevicePartition::If( d_temp_storage, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for (unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK( hipcub::DevicePartition::If( d_temp_storage, temp_storage_bytes, d_input, d_output, d_num_selected_output, static_cast(input.size()), select_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_num_selected_output)); } template void run_threeway(benchmark::State& state, const hipStream_t stream, const T small_threshold, const T large_threshold, const size_t size) { const auto input = benchmark_utils::get_random_data(size, static_cast(0), static_cast(100)); T* d_input = nullptr; T* d_first_output = nullptr; T* d_second_output = nullptr; T* d_unselected_output = nullptr; unsigned int* d_num_selected_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_first_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_second_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_unselected_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_num_selected_output, 2 * sizeof(unsigned int))); const auto select_first_part_op = LessOp{small_threshold}; const auto select_second_part_op = LessOp{large_threshold}; // Allocate temporary storage void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; HIP_CHECK( hipcub::DevicePartition::If( nullptr, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, select_second_part_op, stream ) ); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_bytes)); // Warm-up HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int i = 0; i < warmup_size; ++i) { HIP_CHECK( hipcub::DevicePartition::If( d_temp_storage, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, select_second_part_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // Run benchmark for(auto _ : state) { namespace chrono = std::chrono; using clock = chrono::high_resolution_clock; const auto start = clock::now(); for (unsigned int i = 0; i < batch_size; ++i) { HIP_CHECK( hipcub::DevicePartition::If( d_temp_storage, temp_storage_bytes, d_input, d_first_output, d_second_output, d_unselected_output, d_num_selected_output, static_cast(input.size()), select_first_part_op, select_second_part_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const auto end = clock::now(); using seconds_d = chrono::duration; const auto elapsed_seconds = chrono::duration_cast(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetItemsProcessed(state.iterations() * batch_size * input.size()); state.SetBytesProcessed( static_cast(state.iterations() * batch_size * input.size() * sizeof(input[0]))); HIP_CHECK(hipFree(d_temp_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_first_output)); HIP_CHECK(hipFree(d_second_output)); HIP_CHECK(hipFree(d_unselected_output)); HIP_CHECK(hipFree(d_num_selected_output)); } #define CREATE_BENCHMARK_FLAGGED(T, T_FLAG, SPLIT_T) \ benchmark::RegisterBenchmark( \ "parition_flagged<" #T ", " #T_FLAG ">(" #SPLIT_T "%)", \ &run_flagged, stream, static_cast(SPLIT_T), size \ ) #define CREATE_BENCHMARK_PREDICATE(T, SPLIT_T) \ benchmark::RegisterBenchmark( \ "parition_predicate<" #T ">(" #SPLIT_T "%)", \ &run_predicate, stream, static_cast(SPLIT_T), size \ ) #define CREATE_BENCHMARK_THREEWAY(T, SMALL_T, LARGE_T) \ benchmark::RegisterBenchmark( \ "parition_three_way(Small Threshold:" #SMALL_T "%,Large Threshold:" #LARGE_T "%)", \ &run_threeway, stream, static_cast(SMALL_T), static_cast(LARGE_T), size \ ) #define BENCHMARK_FLAGGED_TYPE(type, flag_type) \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 33), \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 50), \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 60), \ CREATE_BENCHMARK_FLAGGED(type, flag_type, 90) #define BENCHMARK_PREDICATE_TYPE(type) \ CREATE_BENCHMARK_PREDICATE(type, 33), \ CREATE_BENCHMARK_PREDICATE(type, 50), \ CREATE_BENCHMARK_PREDICATE(type, 60), \ CREATE_BENCHMARK_PREDICATE(type, 90) #define BENCHMARK_THREEWAY_TYPE(type) \ CREATE_BENCHMARK_THREEWAY(type, 33, 66), \ CREATE_BENCHMARK_THREEWAY(type, 10, 66), \ CREATE_BENCHMARK_THREEWAY(type, 50, 60), \ CREATE_BENCHMARK_THREEWAY(type, 50, 90) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_partition" << std::endl; // HIP const hipStream_t stream = 0; // default { hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; } using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int8_t, unsigned char), BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(long long, uint8_t), BENCHMARK_FLAGGED_TYPE(double, int8_t), BENCHMARK_FLAGGED_TYPE(custom_float2, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_PREDICATE_TYPE(int8_t), BENCHMARK_PREDICATE_TYPE(int), BENCHMARK_PREDICATE_TYPE(float), BENCHMARK_PREDICATE_TYPE(long long), BENCHMARK_PREDICATE_TYPE(double), BENCHMARK_PREDICATE_TYPE(custom_float2), BENCHMARK_PREDICATE_TYPE(custom_double2), BENCHMARK_THREEWAY_TYPE(int8_t), BENCHMARK_THREEWAY_TYPE(int), BENCHMARK_THREEWAY_TYPE(float), BENCHMARK_THREEWAY_TYPE(long long), BENCHMARK_THREEWAY_TYPE(double), BENCHMARK_THREEWAY_TYPE(custom_float2), BENCHMARK_THREEWAY_TYPE(custom_double2), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_radix_sort.cpp000066400000000000000000000312401447643347700237640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_radix_sort.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template std::vector generate_keys(size_t size) { using key_type = Key; if(std::is_floating_point::value) { return benchmark_utils::get_random_data(size, (key_type)-1000, (key_type)+1000, size); } else { return benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max(), size ); } } template void run_sort_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size, std::shared_ptr> keys_input) { using key_type = Key; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, size_t, int, int, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceRadixSort::SortKeys ; sort_func func_descending = &hipcub::DeviceRadixSort::SortKeysDescending ; sort_func sorting = Descending ? func_descending : func_ascending; key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8 , stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size, std::shared_ptr> keys_input) { using key_type = Key; using value_type = Value; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, const value_type *, value_type *, size_t, int, int, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceRadixSort::SortPairs ; sort_func func_descending = &hipcub::DeviceRadixSort::SortPairsDescending ; sort_func sorting = Descending ? func_descending : func_ascending; std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input->data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; value_type * d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed( state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) ); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key) \ { \ auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "").c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ ) \ ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">, descending").c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, stream, size, keys_input); } \ ) \ ); \ } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value) \ { \ auto keys_input = std::make_shared>(generate_keys(size)); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "").c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ ) \ ); \ benchmarks.push_back( \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value">, descending").c_str(), \ [=](benchmark::State& state) { run_sort_pairs_benchmark(state, stream, size, keys_input); } \ ) \ ); \ } void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { CREATE_SORT_KEYS_BENCHMARK(int) CREATE_SORT_KEYS_BENCHMARK(long long) CREATE_SORT_KEYS_BENCHMARK(int8_t) CREATE_SORT_KEYS_BENCHMARK(uint8_t) CREATE_SORT_KEYS_BENCHMARK(short) } void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; using custom_char_double = benchmark_utils::custom_type; using custom_double_char = benchmark_utils::custom_type; CREATE_SORT_PAIRS_BENCHMARK(int, float) CREATE_SORT_PAIRS_BENCHMARK(int, double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(int, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, float) CREATE_SORT_PAIRS_BENCHMARK(long long, double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_float2) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_char_double) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double_char) CREATE_SORT_PAIRS_BENCHMARK(long long, custom_double2) CREATE_SORT_PAIRS_BENCHMARK(int8_t, int8_t) CREATE_SORT_PAIRS_BENCHMARK(uint8_t, uint8_t) } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_reduce.cpp000066400000000000000000000151731447643347700230640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template< class T, class OutputT, class ReduceKernel > void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, ReduceKernel reduce) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); T * d_input; OutputT * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, sizeof(OutputT))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void * d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK( reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream, false ) ); HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( reduce( d_temp_storage, temp_storage_size_bytes, d_input, d_output, size, stream, false ) ); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } template struct Benchmark; template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { run_benchmark(state, size, stream, hipcub::DeviceReduce::Sum); } }; template struct Benchmark { static void run(benchmark::State& state, size_t size, const hipStream_t stream) { run_benchmark(state, size, stream, hipcub::DeviceReduce::Min); } }; template struct Benchmark { using Difference = int; using Iterator = typename hipcub::ArgIndexInputIterator; using KeyValue = typename Iterator::value_type; static void run(benchmark::State& state, size_t size, const hipStream_t stream) { run_benchmark(state, size, stream, hipcub::DeviceReduce::ArgMin); } }; #define CREATE_BENCHMARK(T, REDUCE_OP) \ benchmark::RegisterBenchmark( \ ("reduce"), \ &Benchmark::run, size, stream \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ CREATE_BENCHMARK(int, REDUCE_OP), \ CREATE_BENCHMARK(long long, REDUCE_OP), \ CREATE_BENCHMARK(float, REDUCE_OP), \ CREATE_BENCHMARK(double, REDUCE_OP), \ CREATE_BENCHMARK(int8_t, REDUCE_OP) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::Min), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(custom_double2, hipcub::ArgMin), #endif }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_reduce_by_key.cpp000066400000000000000000000203151447643347700244200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size, BinaryFunction reduce_op) { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input(size); unsigned int unique_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[unique_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { keys_input[i] = unique_count; } unique_count++; offset += key_count; } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type * d_keys_input; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); key_type * d_unique_output; value_type * d_aggregates_output; unsigned int * d_unique_count_output; HIP_CHECK(hipMalloc(&d_unique_output, unique_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_aggregates_output, unique_count * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_unique_count_output, sizeof(unsigned int))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceReduce::ReduceByKey( nullptr, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( hipcub::DeviceReduce::ReduceByKey( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceReduce::ReduceByKey( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_unique_output, d_values_input, d_aggregates_output, d_unique_count_output, reduce_op, size, stream ) ); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_aggregates_output)); HIP_CHECK(hipFree(d_unique_count_output)); } #define CREATE_BENCHMARK(Key, Value, REDUCE_OP) \ benchmark::RegisterBenchmark( \ (std::string("reduce_by_key") + "" + \ "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ ).c_str(), \ &run_benchmark, \ max_length, stream, size, REDUCE_OP() \ ) #define CREATE_BENCHMARKS(REDUCE_OP) \ CREATE_BENCHMARK(int, float, REDUCE_OP), \ CREATE_BENCHMARK(int, double, REDUCE_OP), \ CREATE_BENCHMARK(int, custom_double2, REDUCE_OP), \ CREATE_BENCHMARK(int8_t, int8_t, REDUCE_OP), \ CREATE_BENCHMARK(long long, float, REDUCE_OP), \ CREATE_BENCHMARK(long long, double, REDUCE_OP) void add_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARK(long long, custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API CREATE_BENCHMARK(long long, custom_double2, hipcub::Min), #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_reduce_by_key" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(1000, benchmarks, stream, size); add_benchmarks(10, benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_run_length_encode.cpp000066400000000000000000000272171447643347700253010ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // CUB's implementation of DeviceRunLengthEncode has unused parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_run_length_encode.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += key_count; } key_type * d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); key_type * d_unique_output; count_type * d_counts_output; count_type * d_runs_count_output; HIP_CHECK(hipMalloc(&d_unique_output, runs_count * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceRunLengthEncode::Encode( nullptr, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceRunLengthEncode::Encode( d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { hipcub::DeviceRunLengthEncode::Encode( d_temporary_storage, temporary_storage_bytes, d_input, d_unique_output, d_counts_output, d_runs_count_output, size, stream, false ); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } template void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using offset_type = unsigned int; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = benchmark_utils::get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += key_count; } key_type * d_input; HIP_CHECK(hipMalloc(&d_input, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); offset_type * d_offsets_output; count_type * d_counts_output; count_type * d_runs_count_output; HIP_CHECK(hipMalloc(&d_offsets_output, runs_count * sizeof(offset_type))); HIP_CHECK(hipMalloc(&d_counts_output, runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(&d_runs_count_output, sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( hipcub::DeviceRunLengthEncode::NonTrivialRuns( nullptr, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceRunLengthEncode::NonTrivialRuns( d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { hipcub::DeviceRunLengthEncode::NonTrivialRuns( d_temporary_storage, temporary_storage_bytes, d_input, d_offsets_output, d_counts_output, d_runs_count_output, size, stream, false ); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_offsets_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } #define CREATE_ENCODE_BENCHMARK(T) \ benchmark::RegisterBenchmark( \ (std::string("run_length_encode") + "" + \ "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ ).c_str(), \ &run_encode_benchmark, \ max_length, stream, size \ ) void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_ENCODE_BENCHMARK(int), CREATE_ENCODE_BENCHMARK(long long), CREATE_ENCODE_BENCHMARK(int8_t), CREATE_ENCODE_BENCHMARK(uint8_t), CREATE_ENCODE_BENCHMARK(custom_float2), CREATE_ENCODE_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ benchmark::RegisterBenchmark( \ (std::string("run_length_encode_non_trivial_runs") + "" + \ "(Random Number Range:[1, " + std::to_string(max_length) + "])" \ ).c_str(), \ &run_non_trivial_runs_benchmark, \ max_length, stream, size \ ) void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_run_length_encode" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_encode_benchmarks(1000, benchmarks, stream, size); add_encode_benchmarks(10, benchmarks, stream, size); add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_scan.cpp000066400000000000000000000317571447643347700225470ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE // CUB's implementation of single_pass_scan_operators has maybe uninitialized parameters, // disable the warning because all warnings are threated as errors: #ifdef __HIP_PLATFORM_NVIDIA__ #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< bool Exclusive, class T, class BinaryFunction > auto run_device_scan(void * temporary_storage, size_t& storage_size, T * input, T * output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream, const bool debug = false) -> typename std::enable_if::type { return hipcub::DeviceScan::ExclusiveScan( temporary_storage, storage_size, input, output, scan_op, initial_value, input_size, stream, debug ); } template< bool Exclusive, class T, class BinaryFunction > auto run_device_scan(void * temporary_storage, size_t& storage_size, T * input, T * output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream, const bool debug = false) -> typename std::enable_if::type { (void) initial_value; return hipcub::DeviceScan::InclusiveScan( temporary_storage, storage_size, input, output, scan_op, input_size, stream, debug ); } template< bool Exclusive, class T, class K, class BinaryFunction > auto run_device_scan_by_key(void * temporary_storage, size_t& storage_size, K * keys, T * input, T * output, const T initial_value, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream, const bool debug = false) -> typename std::enable_if::type { return hipcub::DeviceScan::ExclusiveScanByKey( temporary_storage, storage_size, keys, input, output, scan_op, initial_value, static_cast(input_size), hipcub::Equality(), stream, debug ); } template< bool Exclusive, class T, class K, class BinaryFunction > auto run_device_scan_by_key(void * temporary_storage, size_t& storage_size, K * keys, T * input, T * output, const T /*initial_value*/, const size_t input_size, BinaryFunction scan_op, const hipStream_t stream, const bool debug = false) -> typename std::enable_if::type { return hipcub::DeviceScan::InclusiveScanByKey( temporary_storage, storage_size, keys, input, output, scan_op, static_cast(input_size), hipcub::Equality(), stream, debug ); } template< bool Exclusive, class T, class BinaryFunction > void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, BinaryFunction scan_op) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); T initial_value = T(123); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void * d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK(( run_device_scan( d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream ) )); HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { HIP_CHECK(( run_device_scan( d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream ) )); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(( run_device_scan( d_temp_storage, temp_storage_size_bytes, d_input, d_output, initial_value, size, scan_op, stream ) )); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } template< bool Exclusive, class T, class BinaryFunction > void run_benchmark_by_key(benchmark::State& state, size_t size, const hipStream_t stream, BinaryFunction scan_op) { using key_type = int; constexpr size_t max_segment_length = 100; const std::vector keys = benchmark_utils::get_random_segments( size, max_segment_length, std::random_device{}() ); const std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); const T initial_value = T(123); key_type * d_keys; T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_keys, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_keys, keys.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; void * d_temp_storage = nullptr; // Get size of d_temp_storage HIP_CHECK(( run_device_scan_by_key( d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream ) )); HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 5; i++) { HIP_CHECK(( run_device_scan_by_key( d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream ) )); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(( run_device_scan_by_key( d_temp_storage, temp_storage_size_bytes, d_keys, d_input, d_output, initial_value, size, scan_op, stream ) )); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_temp_storage)); } #define CREATE_BENCHMARK(EXCL, T, SCAN_OP) \ benchmark::RegisterBenchmark( \ (std::string(EXCL ? "exclusive_scan" : "inclusive_scan") + \ ("")).c_str(), \ &run_benchmark, size, stream, SCAN_OP() \ ), \ benchmark::RegisterBenchmark( \ (std::string(EXCL ? "exclusive_scan_by_key" : "inclusive_scan_by_key") + \ ("")).c_str(), \ &run_benchmark_by_key, size, stream, SCAN_OP() \ ) #define CREATE_BENCHMARKS(SCAN_OP) \ CREATE_BENCHMARK(false, int, SCAN_OP), \ CREATE_BENCHMARK(true, int, SCAN_OP), \ CREATE_BENCHMARK(false, float, SCAN_OP), \ CREATE_BENCHMARK(true, float, SCAN_OP), \ CREATE_BENCHMARK(false, double, SCAN_OP), \ CREATE_BENCHMARK(true, double, SCAN_OP), \ CREATE_BENCHMARK(false, long long, SCAN_OP), \ CREATE_BENCHMARK(true, long long, SCAN_OP), \ CREATE_BENCHMARK(false, custom_float2, SCAN_OP), \ CREATE_BENCHMARK(true, custom_float2, SCAN_OP), \ CREATE_BENCHMARK(false, custom_double2, SCAN_OP), \ CREATE_BENCHMARK(true, custom_double2, SCAN_OP), \ CREATE_BENCHMARK(false, int8_t, SCAN_OP), \ CREATE_BENCHMARK(true, int8_t, SCAN_OP), \ CREATE_BENCHMARK(false, uint8_t, SCAN_OP), \ CREATE_BENCHMARK(true, uint8_t, SCAN_OP) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; using custom_float2 = benchmark_utils::custom_type; // Compilation may never finish, if the compiler needs to compile too many kernels, // it is recommended to compile benchmarks only for 1-2 types when BENCHMARK_CONFIG_TUNING is used // (all other CREATE_*_BENCHMARK should be commented/removed). // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARKS(hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_segmented_radix_sort.cpp000066400000000000000000000401571447643347700260260ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 4; const unsigned int warmup_size = 2; constexpr bool Ascending = false; constexpr bool Descending = true; template void run_sort_keys_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool descending = false) { using offset_type = int; using key_type = Key; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, int, int, offset_type *, offset_type *, int, int, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortKeys ; sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortKeysDescending ; sort_func sorting = descending ? func_descending : func_ascending; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector keys_input; if(std::is_floating_point::value) { keys_input = benchmark_utils::get_random_data( size, (key_type)-1000, (key_type)+1000); } else { keys_input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, bool descending = false) { using offset_type = int; using key_type = Key; using value_type = Value; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, const value_type *, value_type *, int, int, offset_type *, offset_type *, int, int, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceSegmentedRadixSort::SortPairs ; sort_func func_descending = &hipcub::DeviceSegmentedRadixSort::SortPairsDescending ; sort_func sorting = descending ? func_descending : func_ascending; // Generate data std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; const unsigned int seed = 123; std::default_random_engine gen(seed); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector keys_input; if(std::is_floating_point::value) { keys_input = benchmark_utils::get_random_data( size, (key_type)-1000, (key_type)+1000); } else { keys_input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; value_type * d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed( state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) ); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "" + \ "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Ascending); } \ ) #define CREATE_SORT_KEYS_DESCENDING_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending" \ ).c_str(), \ [=](benchmark::State& state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, Descending); } \ ) #define BENCHMARK_KEY_TYPE(type) \ CREATE_SORT_KEYS_BENCHMARK(type, 1), \ CREATE_SORT_KEYS_BENCHMARK(type, 10), \ CREATE_SORT_KEYS_BENCHMARK(type, 100), \ CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ CREATE_SORT_KEYS_BENCHMARK(type, 10000), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 100), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 1000), \ CREATE_SORT_KEYS_DESCENDING_BENCHMARK(type, 10000) void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), BENCHMARK_KEY_TYPE(int), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "" + \ "(Segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ [=](benchmark::State& state) { \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Ascending); } \ ) #define CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending" \ ).c_str(), \ [=](benchmark::State& state) { \ run_sort_pairs_benchmark(state, SEGMENTS, stream, size, Descending); } \ ) #define BENCHMARK_PAIR_TYPE(type, value) \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 1), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 1000), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 1000), \ CREATE_SORT_PAIRS_DESCENDING_BENCHMARK(type, value, 10000) void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_PAIR_TYPE(int, float), BENCHMARK_PAIR_TYPE(long long, double), BENCHMARK_PAIR_TYPE(int8_t, int8_t), BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), BENCHMARK_PAIR_TYPE(int, custom_float2), BENCHMARK_PAIR_TYPE(long long, custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_radix_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_segmented_reduce.cpp000066400000000000000000000215001447643347700251060ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_segmented_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; using OffsetType = int; template void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size, SegmentedReduceKernel segmented_reduce) { using value_type = T; // Generate data const unsigned int seed = 123; std::default_random_engine gen(seed); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); OffsetType * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(OffsetType))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(OffsetType), hipMemcpyHostToDevice ) ); value_type * d_values_input; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); OutputT * d_aggregates_output; HIP_CHECK(hipMalloc(&d_aggregates_output, segments_count * sizeof(OutputT))); hipcub::Sum reduce_op; void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipStreamSynchronize(stream)); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_aggregates_output)); } template struct Benchmark; template struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { run_benchmark(state, desired_segments, stream, size, hipcub::DeviceSegmentedReduce::Sum); } }; template struct Benchmark { static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { run_benchmark(state, desired_segments, stream, size, hipcub::DeviceSegmentedReduce::Min); } }; template struct Benchmark { using Difference = OffsetType; using Iterator = typename hipcub::ArgIndexInputIterator; using KeyValue = typename Iterator::value_type; static void run(benchmark::State& state, size_t desired_segments, const hipStream_t stream, size_t size) { run_benchmark(state, desired_segments, stream, size, hipcub::DeviceSegmentedReduce::ArgMin); } }; #define CREATE_BENCHMARK(T, SEGMENTS, REDUCE_OP) \ benchmark::RegisterBenchmark( \ (std::string("segmented_reduce") + "" + \ "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)" \ ).c_str(), \ &Benchmark::run, \ SEGMENTS, stream, size \ ) #define BENCHMARK_TYPE(type, REDUCE_OP) \ CREATE_BENCHMARK(type, 1, REDUCE_OP), \ CREATE_BENCHMARK(type, 100, REDUCE_OP), \ CREATE_BENCHMARK(type, 10000, REDUCE_OP) #define CREATE_BENCHMARKS(REDUCE_OP) \ BENCHMARK_TYPE(float, REDUCE_OP), \ BENCHMARK_TYPE(double, REDUCE_OP), \ BENCHMARK_TYPE(int8_t, REDUCE_OP), \ BENCHMARK_TYPE(int, REDUCE_OP) void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; std::vector bs = { CREATE_BENCHMARKS(hipcub::Sum), BENCHMARK_TYPE(custom_double2, hipcub::Sum), CREATE_BENCHMARKS(hipcub::Min), #ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::Min), #endif CREATE_BENCHMARKS(hipcub::ArgMin), #ifdef HIPCUB_ROCPRIM_API BENCHMARK_TYPE(custom_double2, hipcub::ArgMin), #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_segmented_sort.cpp000066400000000000000000000432751447643347700246430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/hipcub.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 4; const unsigned int warmup_size = 2; template void run_sort_keys_benchmark(benchmark::State &state, size_t desired_segments, hipStream_t stream, size_t size, bool Descending = false, bool Stable = false) { using offset_type = int; using key_type = Key; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, int, int, offset_type *, offset_type *, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortKeys ; sort_func func_descending = &hipcub::DeviceSegmentedSort::SortKeysDescending ; sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortKeys ; sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortKeysDescending ; sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) : (Stable ? func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while (offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); ++segments_count; offset += segment_length; } offsets.push_back(size); std::vector keys_input; if (std::is_floating_point::value) { keys_input = benchmark_utils::get_random_data( size, static_cast(-1000), static_cast(1000) ); } else { keys_input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for (size_t i = 0; i < warmup_size; ++i) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < batch_size; ++i) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State &state, size_t desired_segments, hipStream_t stream, size_t size, bool Descending = false, bool Stable = false) { using offset_type = int; using key_type = Key; using value_type = Value; typedef hipError_t (*sort_func) ( void *, size_t&, const key_type *, key_type *, const value_type *, value_type *, int, int, offset_type *, offset_type *, hipStream_t, bool); sort_func func_ascending = &hipcub::DeviceSegmentedSort::SortPairs ; sort_func func_descending = &hipcub::DeviceSegmentedSort::SortPairsDescending ; sort_func func_ascending_stable = &hipcub::DeviceSegmentedSort::StableSortPairs ; sort_func func_descending_stable = &hipcub::DeviceSegmentedSort::StableSortPairsDescending ; sort_func sorting = Descending ? (Stable ? func_descending_stable : func_descending) : (Stable ? func_ascending_stable : func_ascending); std::vector offsets; const double avg_segment_length = static_cast(size) / desired_segments; std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); unsigned int segments_count = 0; size_t offset = 0; while (offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); ++segments_count; offset += segment_length; } offsets.push_back(size); std::vector keys_input; if (std::is_floating_point::value) { keys_input = benchmark_utils::get_random_data( size, static_cast(-1000), static_cast(1000) ); } else { keys_input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; value_type * d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for (size_t i = 0; i < warmup_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < batch_size; i++) { HIP_CHECK( sorting( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed( state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_SORT_KEYS_BENCHMARK(Key, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "" + \ "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, false, true); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_keys") + "<" #Key ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_keys_benchmark(state, SEGMENTS, stream, size, true, true); }) #define BENCHMARK_KEY_TYPE(type) \ CREATE_SORT_KEYS_BENCHMARK(type, 10), \ CREATE_SORT_KEYS_BENCHMARK(type, 100), \ CREATE_SORT_KEYS_BENCHMARK(type, 1000), \ CREATE_SORT_KEYS_BENCHMARK(type, 10000) void add_sort_keys_benchmarks(std::vector &benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_KEY_TYPE(float), BENCHMARK_KEY_TYPE(double), BENCHMARK_KEY_TYPE(int8_t), BENCHMARK_KEY_TYPE(uint8_t), BENCHMARK_KEY_TYPE(int), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_SORT_PAIRS_BENCHMARK(Key, Value, SEGMENTS) \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "" + \ "(Number of segments:~" + std::to_string(SEGMENTS) + " segments)") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, false, true); }), \ benchmark::RegisterBenchmark( \ (std::string("sort_pairs") + "<" #Key ", " #Value ">" + \ "(~" + std::to_string(SEGMENTS) + " segments), descending, stable") \ .c_str(), \ [=](benchmark::State &state) { run_sort_pairs_benchmark(state, SEGMENTS, stream, size, true, true); }) #define BENCHMARK_PAIR_TYPE(type, value) \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 100), \ CREATE_SORT_PAIRS_BENCHMARK(type, value, 10000) void add_sort_pairs_benchmarks(std::vector &benchmarks, hipStream_t stream, size_t size) { using custom_float2 = benchmark_utils::custom_type; using custom_double2 = benchmark_utils::custom_type; std::vector bs = { BENCHMARK_PAIR_TYPE(int, float), BENCHMARK_PAIR_TYPE(long long, double), BENCHMARK_PAIR_TYPE(int8_t, int8_t), BENCHMARK_PAIR_TYPE(uint8_t, uint8_t), BENCHMARK_PAIR_TYPE(int, custom_float2), BENCHMARK_PAIR_TYPE(long long, custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_segmented_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for (auto &b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if (trials > 0) { for (auto &b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_select.cpp000066400000000000000000000471541447643347700231000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_select.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_flagged_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input; std::vector flags = benchmark_utils::get_random_data01(size, true_probability); std::vector selected_count_output(1); if(std::is_floating_point::value) { input = benchmark_utils::get_random_data(size, T(-1000), T(1000)); } else { input = benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; // Get size of d_temp_storage HIP_CHECK( hipcub::DeviceSelect::Flagged( nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ) ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceSelect::Flagged( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceSelect::Flagged( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_flags); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); HIP_CHECK(hipDeviceSynchronize()); } template void run_selectop_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input = benchmark_utils::get_random_data(size, T(0), T(1000)); std::vector selected_count_output(1); auto select_op = [true_probability] __device__ (const T& value) -> bool { if(value < T(1000 * true_probability)) return true; return false; }; T * d_input; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK( hipcub::DeviceSelect::If( nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ) ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceSelect::If( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceSelect::If( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); HIP_CHECK(hipDeviceSynchronize()); } template void run_unique_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { hipcub::Sum op; std::vector input(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input[i] = op(acc, input01[i]); } } std::vector selected_count_output(1); T * d_input; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK( hipcub::DeviceSelect::Unique( nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream ) ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceSelect::Unique( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceSelect::Unique( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } template void run_unique_by_key_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { hipcub::Sum op; std::vector input_keys(size); { auto input01 = benchmark_utils::get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input_keys[0] = acc; for (size_t i = 1; i < input01.size(); i++) { input_keys[i] = op(acc, input01[i]); } } const auto input_values = benchmark_utils::get_random_data(size, ValueT(-1000), ValueT(1000)); unsigned int selected_count_output = 0; KeyT* d_keys_input; ValueT* d_values_input; KeyT* d_keys_output; ValueT* d_values_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(selected_count_output))); HIP_CHECK( hipMemcpy( d_keys_input, input_keys.data(), input_keys.size() * sizeof(input_keys[0]), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_values_input, input_values.data(), input_values.size() * sizeof(input_values[0]), hipMemcpyHostToDevice ) ); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK( hipcub::DeviceSelect::UniqueByKey( nullptr, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream ) ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for (size_t i = 0; i < 10; i++) { HIP_CHECK( hipcub::DeviceSelect::UniqueByKey( d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); const unsigned int batch_size = 10; for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < batch_size; i++) { HIP_CHECK( hipcub::DeviceSelect::UniqueByKey( d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), stream ) ); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(KeyT) + sizeof(ValueT))); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_keys_input); hipFree(d_values_input); hipFree(d_keys_output); hipFree(d_values_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ ("select_flagged(Probability:" #p")"), \ &run_flagged_benchmark, size, stream, p \ ) #define CREATE_SELECT_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ ("select_if(Probability:" #p")"), \ &run_selectop_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ ("unique(Probability:" #p")"), \ &run_unique_benchmark, size, stream, p \ ) #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark( \ ("unique_by_key<" #K ", "#V", unsigned int>(p = " #p")"), \ &run_unique_by_key_benchmark, size, stream, p \ ) #define BENCHMARK_FLAGGED_TYPE(type, value) \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) #define BENCHMARK_IF_TYPE(type) \ CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.75f) #define BENCHMARK_UNIQUE_TYPE(type) \ CREATE_UNIQUE_BENCHMARK(type, 0.05f), \ CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ CREATE_UNIQUE_BENCHMARK(type, 0.75f) #define BENCHMARK_UNIQUE_BY_KEY_TYPE(key_type, value_type) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.05f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.25f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.5f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(key_type, value_type, 0.75f) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_device_select" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(double, unsigned char), BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_IF_TYPE(int), BENCHMARK_IF_TYPE(float), BENCHMARK_IF_TYPE(double), BENCHMARK_IF_TYPE(uint8_t), BENCHMARK_IF_TYPE(int8_t), BENCHMARK_IF_TYPE(custom_int_double), BENCHMARK_UNIQUE_TYPE(int), BENCHMARK_UNIQUE_TYPE(float), BENCHMARK_UNIQUE_TYPE(double), BENCHMARK_UNIQUE_TYPE(uint8_t), BENCHMARK_UNIQUE_TYPE(int8_t), BENCHMARK_UNIQUE_TYPE(custom_int_double), BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_device_spmv.cpp000066400000000000000000000204121447643347700225720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/device/device_spmv.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float probability) { const T rand_min = T(1); const T rand_max = T(10); // generate a lexicograhically sorted list of (row, column) index tuples // number of nonzeroes cannot be guaranteed as duplicates may exist const int num_nonzeroes_attempt = static_cast(std::min( static_cast(INT_MAX), static_cast(probability * static_cast(size * size)))); std::vector> indices(num_nonzeroes_attempt); { std::vector flat_indices = benchmark_utils::get_random_data( 2 * num_nonzeroes_attempt, 0, size - 1, 2 * num_nonzeroes_attempt); for(size_t i = 0; i < num_nonzeroes_attempt; i++) { indices[i] = std::make_pair(flat_indices[2 * i], flat_indices[2 * i + 1]); } std::sort(indices.begin(), indices.end()); } // generate the compressed sparse rows matrix std::pair prev_cell = std::make_pair(-1, -1); int num_nonzeroes = 0; std::vector row_offsets(size + 1); // this vector might be too large, but doing the allocation now eliminates a scan std::vector column_indices(num_nonzeroes_attempt); row_offsets[0] = 0; int last_row_written = 0; for(size_t i = 0; i < num_nonzeroes_attempt; i++) { if(indices[i] != prev_cell) { // update the row offets if we go to the next row (or skip some) if(indices[i].first != last_row_written) { for(int j = last_row_written + 1; j <= indices[i].first; j++) { row_offsets[j] = num_nonzeroes; } last_row_written = indices[i].first; } column_indices[num_nonzeroes++] = indices[i].second; prev_cell = indices[i]; } } // fill in the entries for any missing rows for(int j = last_row_written + 1; j < size + 1; j++) { row_offsets[j] = num_nonzeroes; } // generate the random data once the actual number of nonzeroes are known std::vector values = benchmark_utils::get_random_data(num_nonzeroes, rand_min, rand_max); std::vector vector_x = benchmark_utils::get_random_data(size, rand_min, rand_max); T * d_values; int * d_row_offsets; int * d_column_indices; T * d_vector_x; T * d_vector_y; HIP_CHECK(hipMalloc(&d_values, values.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_row_offsets, row_offsets.size() * sizeof(int))); HIP_CHECK(hipMalloc(&d_column_indices, num_nonzeroes * sizeof(int))); HIP_CHECK(hipMalloc(&d_vector_x, vector_x.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_vector_y, size * sizeof(T))); HIP_CHECK(hipMemcpy( d_values, values.data(), values.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy( d_row_offsets, row_offsets.data(), row_offsets.size() * sizeof(int), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy( d_column_indices, column_indices.data(), num_nonzeroes * sizeof(int), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy( d_vector_x, vector_x.data(), vector_x.size() * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(hipcub::DeviceSpmv::CsrMV( nullptr, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(hipcub::DeviceSpmv::CsrMV( d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); } HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipcub::DeviceSpmv::CsrMV( d_temp_storage, temp_storage_size_bytes, d_values, d_row_offsets, d_column_indices, d_vector_x, d_vector_y, size, size, num_nonzeroes, stream)); } HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * batch_size * (num_nonzeroes + size) * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * (num_nonzeroes + size)); hipFree(d_temp_storage); hipFree(d_vector_y); hipFree(d_vector_x); hipFree(d_column_indices); hipFree(d_row_offsets); hipFree(d_values); HIP_CHECK(hipDeviceSynchronize()); } #define CREATE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ ("CsrMV<" #T ">(p = " #p")"), \ &run_benchmark, size, stream, p \ ) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1.0e-6f), \ CREATE_BENCHMARK(type, 1.0e-5f), \ CREATE_BENCHMARK(type, 1.0e-4f), \ CREATE_BENCHMARK(type, 1.0e-3f), \ CREATE_BENCHMARK(type, 1.0e-2f) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(unsigned int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_utils.hpp000066400000000000000000000307131447643347700214400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef HIPCUB_BENCHMARK_UTILS_HPP_ #define HIPCUB_BENCHMARK_UTILS_HPP_ #ifndef BENCHMARK_UTILS_INCLUDE_GUARD #error benchmark_utils.hpp must ONLY be included by common_benchmark_header.hpp. Please include common_benchmark_header.hpp instead. #endif // hipCUB API #ifdef __HIP_PLATFORM_AMD__ #include "hipcub/backend/rocprim/util_ptx.hpp" #elif defined(__HIP_PLATFORM_NVIDIA__) #include "hipcub/config.hpp" #include #endif #ifndef HIPCUB_CUB_API #define HIPCUB_WARP_THREADS_MACRO warpSize #else #define HIPCUB_WARP_THREADS_MACRO CUB_PTX_WARP_THREADS #endif namespace benchmark_utils { const size_t default_max_random_size = 1024 * 1024; // get_random_data() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> typename std::enable_if::value, std::vector>::type { std::random_device rd; std::default_random_engine gen(rd()); using distribution_type = typename std::conditional<(sizeof(T)==1), short, T>::type; std::uniform_int_distribution distribution(min, max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = default_max_random_size) -> typename std::enable_if::value, std::vector>::type { std::random_device rd; std::default_random_engine gen(rd()); std::uniform_real_distribution distribution(min, max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = default_max_random_size) { std::random_device rd; std::default_random_engine gen(rd()); std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline T get_random_value(T min, T max) { return get_random_data(1, min, max)[0]; } // Can't use std::prefix_sum for inclusive/exclusive scan, because // it does not handle short[] -> int(int a, int b) { a + b; } -> int[] // they way we expect. That's because sum in std::prefix_sum's implementation // is of type typename std::iterator_traits::value_type (short) template OutputIt host_inclusive_scan(InputIt first, InputIt last, OutputIt d_first, BinaryOperation op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional< std::is_void::value, input_type, output_type >::type; if (first == last) return d_first; result_type sum = *first; *d_first = sum; while (++first != last) { sum = op(sum, static_cast(*first)); *++d_first = sum; } return ++d_first; } template OutputIt host_exclusive_scan(InputIt first, InputIt last, T initial_value, OutputIt d_first, BinaryOperation op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional< std::is_void::value, input_type, output_type >::type; if (first == last) return d_first; result_type sum = initial_value; *d_first = initial_value; while ((first+1) != last) { sum = op(sum, static_cast(*first)); *++d_first = sum; first++; } return ++d_first; } template OutputIt host_exclusive_scan_by_key(InputIt first, InputIt last, KeyIt k_first, T initial_value, OutputIt d_first, BinaryOperation op, KeyCompare key_compare_op) { using input_type = typename std::iterator_traits::value_type; using output_type = typename std::iterator_traits::value_type; using result_type = typename std::conditional< std::is_void::value, input_type, output_type >::type; if (first == last) return d_first; result_type sum = initial_value; *d_first = initial_value; while ((first+1) != last) { if(key_compare_op(*k_first, *++k_first)) { sum = op(sum, static_cast(*first)); } else { sum = initial_value; } *++d_first = sum; first++; } return ++d_first; } template struct custom_type { using first_type = T; using second_type = U; T x; U y; HIPCUB_HOST_DEVICE inline constexpr custom_type() {} HIPCUB_HOST_DEVICE inline constexpr custom_type(T xx, U yy) : x(xx), y(yy) { } HIPCUB_HOST_DEVICE inline constexpr custom_type(T xy) : x(xy), y(xy) { } template HIPCUB_HOST_DEVICE inline custom_type(const custom_type& other) { x = other.x; y = other.y; } #ifndef HIPCUB_CUB_API HIPCUB_HOST_DEVICE inline ~custom_type() = default; #endif HIPCUB_HOST_DEVICE inline custom_type& operator=(const custom_type& other) { x = other.x; y = other.y; return *this; } HIPCUB_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } HIPCUB_HOST_DEVICE inline custom_type operator-(const custom_type& other) const { return custom_type(x - other.x, y - other.y); } HIPCUB_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { // intentionally suboptimal choice for short-circuting, // required to generate more performant device code return ((x == rhs.x && y < rhs.y) || x < rhs.x); } HIPCUB_HOST_DEVICE inline bool operator>(const custom_type& other) const { return (x > other.x || (x == other.x && y > other.y)); } HIPCUB_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } HIPCUB_HOST_DEVICE inline bool operator!=(const custom_type& other) const { return !(*this == other); } }; template struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector data(size); auto fdata = get_random_data(size, min.x, max.x, max_random_size); auto sdata = get_random_data(size, min.y, max.y, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(fdata[i], sdata[i]); } return data; } template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value && !std::is_same::value, std::vector>::type { using field_type = decltype(max.x); std::vector data(size); auto field_data = get_random_data(size, min.x, max.x, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(field_data[i]); } return data; } template std::vector get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value) { static_assert(std::is_arithmetic::value, "Key type must be arithmetic"); std::default_random_engine prng(seed_value); std::uniform_int_distribution segment_length_distribution(max_segment_length); using key_distribution_type = std::conditional_t< std::is_integral::value, std::uniform_int_distribution, std::uniform_real_distribution >; key_distribution_type key_distribution(std::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; while (keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill( std::next(keys.begin(), keys_start_index), std::next(keys.begin(), new_segment_end), key ); keys_start_index += new_segment_length; } return keys; } bool is_warp_size_supported(const unsigned required_warp_size) { return HIPCUB_HOST_WARP_THREADS >= required_warp_size; } template struct DeviceSelectWarpSize { static constexpr unsigned value = HIPCUB_DEVICE_WARP_THREADS >= LogicalWarpSize ? LogicalWarpSize : HIPCUB_DEVICE_WARP_THREADS; }; } // end benchmark_util namespace // Need for hipcub::DeviceReduce::Min/Max etc. namespace std { template<> class numeric_limits> { using T = typename benchmark_utils::custom_type; public: static constexpr inline T max() { return std::numeric_limits::max(); } static constexpr inline T lowest() { return std::numeric_limits::lowest(); } }; template<> class numeric_limits> { using T = typename benchmark_utils::custom_type; public: static constexpr inline T max() { return std::numeric_limits::max(); } static constexpr inline T lowest() { return std::numeric_limits::lowest(); } }; } #endif // HIPCUB_BENCHMARK_UTILS_HPP_ hipCUB-rocm-5.7.1/benchmark/benchmark_warp_exchange.cpp000066400000000000000000000255161447643347700231130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_exchange.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, template class Op > __global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output) { T thread_data[ItemsPerThread]; #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } using WarpExchangeT = ::hipcub::WarpExchange< T, ItemsPerThread, ::benchmark_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; WarpExchangeT warp_exchange(temp_storage[warp_id]); Op< T, ItemsPerThread, ::benchmark_utils::DeviceSelectWarpSize::value >{}(warp_exchange, thread_data); #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned global_idx = (BlockSize * hipBlockIdx_x + hipThreadIdx_x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template< class T, class OffsetT, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize > __global__ __launch_bounds__(BlockSize) void warp_exchange_scatter_to_striped_kernel(T* d_output) { const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; T thread_data[ItemsPerThread]; OffsetT thread_ranks[ItemsPerThread]; #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); thread_ranks[i] = static_cast(LogicalWarpSize - warp_id * ItemsPerThread - i - 1); } using WarpExchangeT = ::hipcub::WarpExchange< T, ItemsPerThread, ::benchmark_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; __shared__ typename WarpExchangeT::TempStorage temp_storage[warps_in_block]; WarpExchangeT(temp_storage[warp_id]).ScatterToStriped(thread_data, thread_ranks); #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * hipBlockIdx_x + BlockSize * i + hipThreadIdx_x; d_output[striped_global_idx] = thread_data[i]; } } template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, template class Op > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T * d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exchange_kernel< T, BlockSize, ItemsPerThread, LogicalWarpSize, Op > ), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_output ); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } template< class T, class OffsetT, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize > void run_benchmark_scatter_to_striped(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned trials = 100; constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T * d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exchange_scatter_to_striped_kernel< T, OffsetT, BlockSize, ItemsPerThread, LogicalWarpSize > ), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_output ); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } template< class T, unsigned ItemsPerThread, unsigned LogicalWarpSize > struct StripedToBlockedOp { __device__ void operator()( ::hipcub::WarpExchange &warp_exchange, T (&thread_data)[ItemsPerThread] ) const { warp_exchange.StripedToBlocked(thread_data, thread_data); } }; template< class T, unsigned ItemsPerThread, unsigned LogicalWarpSize > struct BlockedToStripedOp { __device__ void operator()( ::hipcub::WarpExchange &warp_exchange, T (&thread_data)[ItemsPerThread] ) const { warp_exchange.BlockedToStriped(thread_data, thread_data); } }; #define CREATE_BENCHMARK_STRIPED_TO_BLOCKED(T, BS, IT, WS) \ benchmark::RegisterBenchmark( \ "warp_exchange_striped_to_blocked.", \ &run_benchmark, \ stream, size \ ) #define CREATE_BENCHMARK_BLOCKED_TO_STRIPED(T, BS, IT, WS) \ benchmark::RegisterBenchmark( \ "warp_exchange_blocked_to_striped.", \ &run_benchmark, \ stream, size \ ) #define CREATE_BENCHMARK_SCATTER_TO_STRIPED(T, OFFSET_T, BS, IT, WS) \ benchmark::RegisterBenchmark( \ "warp_exchange_scatter_to_striped.", \ &run_benchmark_scatter_to_striped, \ stream, size \ ) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_exchange" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 16), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 16), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 16), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 32), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 32), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 32), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 32), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 32), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 32), }; if (::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 128, 4, 64), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 128, 4, 64), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 128, 4, 64), CREATE_BENCHMARK_STRIPED_TO_BLOCKED(int, 256, 4, 64), CREATE_BENCHMARK_BLOCKED_TO_STRIPED(int, 256, 4, 64), CREATE_BENCHMARK_SCATTER_TO_STRIPED(int, int, 256, 4, 64) }; benchmarks.insert( benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end() ); } // Use manual timing for (auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if (trials > 0) { for (auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_warp_load.cpp000066400000000000000000000302521447643347700222410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_load.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpLoadAlgorithm Algorithm > __global__ __launch_bounds__(BlockSize) void warp_load_kernel(T* d_input, T* d_output) { using WarpLoadT = ::hipcub::WarpLoad< T, ItemsPerThread, Algorithm, ::benchmark_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; const unsigned global_warp_id = hipBlockIdx_x * warps_in_block + warp_id; __shared__ typename WarpLoadT::TempStorage temp_storage[warps_in_block]; T thread_data[ItemsPerThread]; WarpLoadT(temp_storage[warp_id]).Load(d_input + global_warp_id * tile_size, thread_data); #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { const unsigned striped_global_idx = BlockSize * ItemsPerThread * hipBlockIdx_x + BlockSize * i + hipThreadIdx_x; d_output[striped_global_idx] = thread_data[i]; } } template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpLoadAlgorithm Algorithm, unsigned Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < Trials; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_load_kernel< T, BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm >), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ "warp_load.", \ &run_benchmark, \ stream, size \ ) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_VECTORIZE) // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_LOAD_TRANSPOSE) }; if (::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_VECTORIZE), // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_LOAD_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_DIRECT), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_STRIPED), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_VECTORIZE) // WARP_LOAD_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_LOAD_TRANSPOSE) }; benchmarks.insert( benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end() ); } // Use manual timing for (auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if (trials > 0) { for (auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_warp_merge_sort.cpp000066400000000000000000000416351447643347700234770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" #include "../test/hipcub/test_utils_sort_comparator.hpp" // HIP API #include "hipcub/block/block_load.hpp" #include "hipcub/block/block_store.hpp" #include "hipcub/util_ptx.hpp" #include "hipcub/warp/warp_merge_sort.hpp" #include #ifndef DEFAULT_N constexpr size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs, }; template< unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, typename T, typename Compare > __global__ __launch_bounds__(BlockSize) void sort_keys(const T* input, T* output, Compare compare_op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int flat_tid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * items_per_block; T keys[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort::value>; __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; wsort.Sort(keys, compare_op); hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } template< unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, typename T, typename Compare > __global__ __launch_bounds__(BlockSize) void sort_pairs(const T* input, T* output, Compare compare_op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int flat_tid = hipThreadIdx_x; const unsigned int block_offset = hipBlockIdx_x * items_per_block; T keys[ItemsPerThread]; T values[ItemsPerThread]; hipcub::LoadDirectBlocked(flat_tid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; ++i) { values[i] = keys[i] + T(1); } constexpr unsigned int warps_per_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort::value, T>; __shared__ typename warp_merge_sort::TempStorage storage[warps_per_block]; warp_merge_sort wsort{storage[warp_id]}; wsort.Sort(keys, values, compare_op); for(unsigned int i = 0; i < ItemsPerThread; ++i) { keys[i] += values[i]; } hipcub::StoreDirectBlocked(flat_tid, output + block_offset, keys); } template struct max_value { static constexpr T value = std::numeric_limits::max(); }; template< unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, typename T, typename Compare > __global__ __launch_bounds__(BlockSize) void sort_keys_segmented(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) { constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort::value>; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = hipBlockIdx_x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; const unsigned int warp_offset = segment_id * max_segment_size; T keys[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); const T oob_default = max_value::value; wsort.Sort(keys, compare, segment_size, oob_default); hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } template< unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, typename T, typename Compare > __global__ __launch_bounds__(BlockSize) void sort_pairs_segmented(const T* input, T* output, const unsigned int* segment_sizes, Compare compare) { constexpr unsigned int max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr unsigned int segments_per_block = BlockSize / LogicalWarpSize; using warp_merge_sort = hipcub::WarpMergeSort::value, T>; __shared__ typename warp_merge_sort::TempStorage storage[segments_per_block]; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; warp_merge_sort wsort{storage[warp_id]}; const unsigned int segment_id = hipBlockIdx_x * segments_per_block + warp_id; const unsigned int segment_size = segment_sizes[segment_id]; const unsigned int warp_offset = segment_id * max_segment_size; T keys[ItemsPerThread]; T values[ItemsPerThread]; const unsigned int flat_tid = wsort.get_linear_tid(); hipcub::LoadDirectBlocked(flat_tid, input + warp_offset, keys, segment_size); for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_tid * ItemsPerThread + i < segment_size) { values[i] = keys[i] + T(1); } } const T oob_default = max_value::value; wsort.Sort(keys, values, compare, segment_size, oob_default); for(unsigned int i = 0; i < ItemsPerThread; ++i) { if(flat_tid * ItemsPerThread + i < segment_size) { keys[i] += values[i]; } } hipcub::StoreDirectBlocked(flat_tid, output + warp_offset, keys, segment_size); } template< class T, unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, class CompareOp = test_utils::less, unsigned int Trials = 10 > void run_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); const auto input = std::is_floating_point::value ? benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); T* d_input = nullptr; T* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { for(unsigned int i = 0; i < Trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp{}); } } else if(benchmark_kind == benchmark_kinds::sort_pairs) { for(unsigned int i = 0; i < Trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output, CompareOp{}); } } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template< class T, unsigned int BlockSize, unsigned int LogicalWarpSize, unsigned int ItemsPerThread, class CompareOp = test_utils::less, unsigned int Trials = 10 > void run_segmented_benchmark(benchmark::State& state, const benchmark_kinds benchmark_kind, const hipStream_t stream, const size_t N) { constexpr auto max_segment_size = LogicalWarpSize * ItemsPerThread; constexpr auto segments_per_block = BlockSize / LogicalWarpSize; constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; const auto num_segments = num_blocks * segments_per_block; const auto size = num_blocks * items_per_block; const auto input = std::is_floating_point::value ? benchmark_utils::get_random_data(size, static_cast(-1000), static_cast(1000)) : benchmark_utils::get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); const auto segment_sizes = benchmark_utils::get_random_data( num_segments, 0, max_segment_size); T* d_input = nullptr; T* d_output = nullptr; unsigned int* d_segment_sizes = nullptr; HIP_CHECK(hipMalloc(&d_input, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_segment_sizes, num_segments * sizeof(segment_sizes[0]))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipMemcpy(d_segment_sizes, segment_sizes.data(), num_segments * sizeof(segment_sizes[0]), hipMemcpyHostToDevice)); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); if(benchmark_kind == benchmark_kinds::sort_keys) { for(unsigned int i = 0; i < Trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_keys_segmented), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, d_segment_sizes, CompareOp{}); } } else if(benchmark_kind == benchmark_kinds::sort_pairs) { for(unsigned int i = 0; i < Trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_pairs_segmented), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, d_segment_sizes, CompareOp{}); } } HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_segment_sizes)); } #define CREATE_BENCHMARK(T, BS, WS, IPT) \ do { \ const auto benchmark_name = \ std::string{"warp_merge_sort.SubAlgorithm Name:"} + name; \ if(WS <= device_warp_size) { \ benchmarks.push_back(benchmark::RegisterBenchmark(benchmark_name.c_str(), \ segmented ? &run_benchmark : &run_segmented_benchmark, \ benchmark_kind, stream, size)); \ } \ } while(false) #define BENCHMARK_TYPE_WS(type, block, warp) \ CREATE_BENCHMARK(type, block, warp, 1); \ CREATE_BENCHMARK(type, block, warp, 4); \ CREATE_BENCHMARK(type, block, warp, 8) #define BENCHMARK_TYPE(type, block) \ BENCHMARK_TYPE_WS(type, block, 4); \ BENCHMARK_TYPE_WS(type, block, 16); \ BENCHMARK_TYPE_WS(type, block, 32); \ BENCHMARK_TYPE_WS(type, block, 64) void add_benchmarks(const benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, const hipStream_t stream, const size_t size, const bool segmented, const unsigned int device_warp_size) { BENCHMARK_TYPE(int, 256); BENCHMARK_TYPE(int8_t, 256); BENCHMARK_TYPE(uint8_t, 256); BENCHMARK_TYPE(long long, 256); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_merge_sort" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; const auto device_warp_size = [] { const int result = HIPCUB_HOST_WARP_THREADS; if(result > 0) { std::cout << "[HIP] Device warp size: " << result << std::endl; } else { std::cerr << "Failed to get device warp size! Aborting.\n"; std::exit(1); } return static_cast(result); }(); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "sort(keys)", benchmarks, stream, size, false, device_warp_size); add_benchmarks(benchmark_kinds::sort_pairs, "sort(keys, values)", benchmarks, stream, size, false, device_warp_size); add_benchmarks(benchmark_kinds::sort_keys, "segmented_sort(keys)", benchmarks, stream, size, true, device_warp_size); add_benchmarks(benchmark_kinds::sort_pairs, "segmented_sort(keys, values)", benchmarks, stream, size, true, device_warp_size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_warp_reduce.cpp000066400000000000000000000207341447643347700225750ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_reduce.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class T, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(64) void warp_reduce_kernel(const T * d_input, T * d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; auto reduce_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).Reduce(value, reduce_op); } d_output[i] = value; } template< class T, class Flag, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(64) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = hipcub::WarpReduce; __shared__ typename wreduce_t::TempStorage storage; #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wreduce_t(storage).HeadSegmentedSum(value, flag); } d_output[i] = value; } template< bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_reduce_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, output ); HIP_CHECK(hipPeekAtLastError()); } template< bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(segmented_warp_reduce_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, flags, output ); HIP_CHECK(hipPeekAtLastError()); } template< bool Segmented, class T, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { using flag_type = unsigned char; const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); std::vector input = benchmark_utils::get_random_data(size, T(0), T(10)); std::vector flags = benchmark_utils::get_random_data(size, 0, 1); T * d_input; flag_type * d_flags; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_flags, size * sizeof(flag_type))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); execute_warp_reduce_kernel( d_input, d_output, d_flags, size, stream ); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); HIP_CHECK(hipFree(d_flags)); } #define CREATE_BENCHMARK(T, WS, BS) \ benchmark::RegisterBenchmark( \ (std::string("warp_reduce.SubAlgorithm Name:") + name).c_str(), \ &run_benchmark, \ stream, size \ ) // If warp size limit is 16 #define BENCHMARK_TYPE_WS16(type) \ CREATE_BENCHMARK(type, 15, 32), \ CREATE_BENCHMARK(type, 16, 32) // If warp size limit is 32 #define BENCHMARK_TYPE_WS32(type) \ BENCHMARK_TYPE_WS16(type), \ CREATE_BENCHMARK(type, 31, 32), \ CREATE_BENCHMARK(type, 32, 32), \ CREATE_BENCHMARK(type, 32, 64) // If warp size limit is 64 #define BENCHMARK_TYPE_WS64(type) \ BENCHMARK_TYPE_WS32(type), \ CREATE_BENCHMARK(type, 37, 64), \ CREATE_BENCHMARK(type, 61, 64), \ CREATE_BENCHMARK(type, 64, 64) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { #if HIPCUB_WARP_THREADS_MACRO == 16 BENCHMARK_TYPE_WS16(int), BENCHMARK_TYPE_WS16(float), BENCHMARK_TYPE_WS16(double), BENCHMARK_TYPE_WS16(int8_t), BENCHMARK_TYPE_WS16(uint8_t) #elif HIPCUB_WARP_THREADS_MACRO == 32 BENCHMARK_TYPE_WS32(int), BENCHMARK_TYPE_WS32(float), BENCHMARK_TYPE_WS32(double), BENCHMARK_TYPE_WS32(int8_t), BENCHMARK_TYPE_WS32(uint8_t) #else BENCHMARK_TYPE_WS64(int), BENCHMARK_TYPE_WS64(float), BENCHMARK_TYPE_WS64(double), BENCHMARK_TYPE_WS64(int8_t), BENCHMARK_TYPE_WS64(uint8_t) #endif }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_reduce" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks("reduce", benchmarks, stream, size); add_benchmarks("segmented_reduce", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_warp_scan.cpp000066400000000000000000000230561447643347700222520ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_scan.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif enum class scan_type { inclusive_scan, exclusive_scan, broadcast }; template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output, const T init) { Runner::template run(input, output, init); } struct inclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).InclusiveScan(value, value, scan_op); } output[i] = value; } }; struct exclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t(storage).ExclusiveScan(value, value, init, scan_op); } output[i] = value; } }; struct broadcast { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; auto value = input[i]; using wscan_t = hipcub::WarpScan; __shared__ typename wscan_t::TempStorage storage; auto scan_op = hipcub::Sum(); #pragma nounroll for(unsigned int trial = 0; trial < Trials; trial++) { value = wscan_t(storage).Broadcast(value, 0); } output[i] = value; } }; template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory std::vector input(size, 1.0f); T * d_input; T * d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, input[0]); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK_IMPL(T, BS, WS, OP) \ benchmark::RegisterBenchmark((std::string("warp_scan.Method Name:") \ + method_name) \ .c_str(), \ &run_benchmark, \ stream, \ size) #define CREATE_BENCHMARK(T, BS, WS) CREATE_BENCHMARK_IMPL(T, BS, WS, Benchmark) // clang-format off // If warp size limit is 16 #define BENCHMARK_TYPE_WS16(type) \ CREATE_BENCHMARK(type, 60, 15), \ CREATE_BENCHMARK(type, 256, 16) // If warp size limit is 32 #define BENCHMARK_TYPE_WS32(type) \ BENCHMARK_TYPE_WS16(type), \ CREATE_BENCHMARK(type, 62, 31), \ CREATE_BENCHMARK(type, 256, 32) // If warp size limit is 64 #define BENCHMARK_TYPE_WS64(type) \ BENCHMARK_TYPE_WS32(type), \ CREATE_BENCHMARK(type, 63, 63), \ CREATE_BENCHMARK(type, 64, 64), \ CREATE_BENCHMARK(type, 128, 64), \ CREATE_BENCHMARK(type, 256, 64) // clang-format on template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t size) { using custom_double2 = benchmark_utils::custom_type; using custom_int_double = benchmark_utils::custom_type; std::vector new_benchmarks = { #if HIPCUB_WARP_THREADS_MACRO == 16 BENCHMARK_TYPE_WS16(int), BENCHMARK_TYPE_WS16(float), BENCHMARK_TYPE_WS16(double), BENCHMARK_TYPE_WS16(int8_t), BENCHMARK_TYPE_WS16(custom_double2), BENCHMARK_TYPE_WS16(custom_int_double) #elif HIPCUB_WARP_THREADS_MACRO == 32 BENCHMARK_TYPE_WS32(int), BENCHMARK_TYPE_WS32(float), BENCHMARK_TYPE_WS32(double), BENCHMARK_TYPE_WS32(int8_t), BENCHMARK_TYPE_WS32(custom_double2), BENCHMARK_TYPE_WS32(custom_int_double) #else BENCHMARK_TYPE_WS64(int), BENCHMARK_TYPE_WS64(float), BENCHMARK_TYPE_WS64(double), BENCHMARK_TYPE_WS64(int8_t), BENCHMARK_TYPE_WS64(custom_double2), BENCHMARK_TYPE_WS64(custom_int_double) #endif }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_scan" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, "inclusive_scan", stream, size); add_benchmarks(benchmarks, "exclusive_scan", stream, size); add_benchmarks(benchmarks, "broadcast", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/benchmark_warp_store.cpp000066400000000000000000000275051447643347700224650ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "common_benchmark_header.hpp" // HIP API #include "hipcub/warp/warp_store.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpStoreAlgorithm Algorithm > __global__ __launch_bounds__(BlockSize) void warp_store_kernel(T* d_output) { T thread_data[ItemsPerThread]; #pragma unroll for (unsigned i = 0; i < ItemsPerThread; ++i) { thread_data[i] = static_cast(i); } using WarpStoreT = ::hipcub::WarpStore< T, ItemsPerThread, Algorithm, ::benchmark_utils::DeviceSelectWarpSize::value >; constexpr unsigned warps_in_block = BlockSize / LogicalWarpSize; constexpr int tile_size = ItemsPerThread * LogicalWarpSize; __shared__ typename WarpStoreT::TempStorage temp_storage[warps_in_block]; const unsigned warp_id = hipThreadIdx_x / LogicalWarpSize; const unsigned global_warp_id = hipBlockIdx_x * warps_in_block + warp_id; WarpStoreT(temp_storage[warp_id]).Store(d_output + global_warp_id * tile_size, thread_data); } template< class T, unsigned BlockSize, unsigned ItemsPerThread, unsigned LogicalWarpSize, ::hipcub::WarpStoreAlgorithm Algorithm, unsigned Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned items_per_block = BlockSize * ItemsPerThread; const unsigned size = items_per_block * ((N + items_per_block - 1) / items_per_block); T * d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); for (auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); for (size_t i = 0; i < Trials; ++i) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_store_kernel< T, BlockSize, ItemsPerThread, LogicalWarpSize, Algorithm >), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_output ); } HIP_CHECK(hipPeekAtLastError()) HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, ALG) \ benchmark::RegisterBenchmark( \ "warp_store.", \ &run_benchmark, \ stream, size \ ) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); std::cout << "benchmark_warp_store" << std::endl; // HIP hipStream_t stream = 0; // default hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); std::cout << "[HIP] Device name: " << devProp.name << std::endl; // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 16, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 32, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 32, ::hipcub::WARP_STORE_TRANSPOSE) }; if (::benchmark_utils::is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(int, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(int, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 4, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_VECTORIZE), CREATE_BENCHMARK(double, 256, 8, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 16, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_VECTORIZE), // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 32, 64, ::hipcub::WARP_STORE_TRANSPOSE), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_DIRECT), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_STRIPED), CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_VECTORIZE) // WARP_STORE_TRANSPOSE removed because of shared memory limit // CREATE_BENCHMARK(double, 256, 64, 64, ::hipcub::WARP_STORE_TRANSPOSE) }; benchmarks.insert( benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end() ); } // Use manual timing for (auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if (trials > 0) { for (auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } hipCUB-rocm-5.7.1/benchmark/cmdparser.hpp000066400000000000000000000416401447643347700202470ustar00rootroot00000000000000// The MIT License (MIT) // // Copyright (c) 2015 - 2016 Florian Rappl // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /* This file is part of the C++ CmdParser utility. Copyright (c) 2015 - 2016 Florian Rappl */ #pragma once #include #include #include #include #include #include namespace cli { struct CallbackArgs { const std::vector& arguments; std::ostream& output; std::ostream& error; }; class Parser { private: class CmdBase { public: explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : name(name), command(name.size() > 0 ? "-" + name : ""), alternative(alternative.size() > 0 ? "--" + alternative : ""), description(description), required(required), handled(false), arguments({}), dominant(dominant), variadic(variadic) { } virtual ~CmdBase() { } std::string name; std::string command; std::string alternative; std::string description; bool required; bool handled; std::vector arguments; bool const dominant; bool const variadic; virtual std::string print_value() const = 0; virtual bool parse(std::ostream& output, std::ostream& error) = 0; bool is(const std::string& given) const { return given == command || given == alternative; } }; template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; template class CmdFunction final : public CmdBase { public: explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream& output, std::ostream& error) { try { CallbackArgs args { arguments, output, error }; value = callback(args); return true; } catch (...) { return false; } } virtual std::string print_value() const { return ""; } std::function callback; T value; }; template class CmdArgument final : public CmdBase { public: explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream&, std::ostream&) { try { value = Parser::parse(arguments, value); return true; } catch (...) { return false; } } virtual std::string print_value() const { return stringify(value); } T value; }; static int parse(const std::vector& elements, const int&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } static bool parse(const std::vector& elements, const bool& defval) { if (elements.size() != 0) throw std::runtime_error("A boolean command line parameter cannot have any arguments."); return !defval; } static double parse(const std::vector& elements, const double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } static float parse(const std::vector& elements, const float&) { if (elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } static long double parse(const std::vector& elements, const long double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } static unsigned int parse(const std::vector& elements, const unsigned int&) { if (elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } static unsigned long parse(const std::vector& elements, const unsigned long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } static unsigned long long parse(const std::vector& elements, const unsigned long long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoull(elements[0]); } static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } static std::string parse(const std::vector& elements, const std::string&) { if (elements.size() != 1) throw std::bad_cast(); return elements[0]; } template static std::vector parse(const std::vector& elements, const std::vector&) { const T defval = T(); std::vector values { }; std::vector buffer(1); for (const auto& element : elements) { buffer[0] = element; values.push_back(parse(buffer, defval)); } return values; } template static std::string stringify(const T& value) { return std::to_string(value); } template static std::string stringify(const std::vector& values) { std::stringstream ss { }; ss << "[ "; for (const auto& value : values) { ss << stringify(value) << " "; } ss << "]"; return ss.str(); } static std::string stringify(const std::string& str) { return str; } public: explicit Parser(int argc, const char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } explicit Parser(int argc, char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } ~Parser() { for (int i = 0, n = _commands.size(); i < n; ++i) { delete _commands[i]; } } bool has_help() const { for (const auto command : _commands) { if (command->name == "h" && command->alternative == "--help") { return true; } } return false; } void enable_help() { set_callback("h", "help", std::function([this](CallbackArgs& args){ args.output << this->usage(); /*exit(0);*/ return false; }), "", true); } void disable_help() { for (auto command = _commands.begin(); command != _commands.end(); ++command) { if ((*command)->name == "h" && (*command)->alternative == "--help") { _commands.erase(command); break; } } } template void set_default(bool is_required, const std::string& description = "") { auto command = new CmdArgument { "", "", description, is_required, false }; _commands.push_back(command); } template void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, true, dominant }; _commands.push_back(command); } template void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, false, dominant }; command->value = defaultValue; _commands.push_back(command); } template void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { auto command = new CmdFunction { name, alternative, description, false, dominant }; command->callback = callback; _commands.push_back(command); } inline void run_and_exit_if_error() { if (run() == false) { exit(1); } } inline bool run() { return run(std::cout, std::cerr); } inline bool run(std::ostream& output) { return run(output, std::cerr); } bool run(std::ostream& output, std::ostream& error) { if (_arguments.size() > 0) { auto current = find_default(); for (int i = 0, n = _arguments.size(); i < n; ++i) { auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; if (associated != nullptr) { current = associated; associated->handled = true; } else if (current == nullptr) { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; } else { current->arguments.push_back(_arguments[i]); current->handled = true; if (!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default // command. current = find_default(); } } } } // First, parse dominant arguments since they succeed even if required // arguments are missing. for (auto command : _commands) { if (command->handled && command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } // Next, check for any missing arguments. for (auto command : _commands) { if (command->required && !command->handled) { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. for (auto command : _commands) { if (command->handled && !command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } return true; } template T get(const std::string& name) const { for (const auto& command : _commands) { if (command->name == name) { auto cmd = dynamic_cast*>(command); if (cmd == nullptr) { throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); } return cmd->value; } } throw std::runtime_error("The parameter " + name + " could not be found."); } template T get_if(const std::string& name, std::function callback) const { auto value = get(name); return callback(value); } int requirements() const { int count = 0; for (const auto& command : _commands) { if (command->required) { ++count; } } return count; } int commands() const { return static_cast(_commands.size()); } inline const std::string& app_name() const { return _appname; } protected: CmdBase* find(const std::string& name) { for (auto command : _commands) { if (command->is(name)) { return command; } } return nullptr; } CmdBase* find_default() { for (auto command : _commands) { if (command->name == "") { return command; } } return nullptr; } std::string usage() const { std::stringstream ss { }; ss << "Available parameters:\n\n"; for (const auto& command : _commands) { ss << " " << command->command << "\t" << command->alternative; if (command->required == true) { ss << "\t(required)"; } ss << "\n " << command->description; if (command->required == false) { ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'."; } ss << "\n\n"; } return ss.str(); } void print_help(std::stringstream& ss) const { if (has_help()) { ss << "For more help use --help or -h.\n"; } } std::string howto_required(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string howto_use(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string no_default() const { std::stringstream ss { }; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); return ss.str(); } private: const std::string _appname; std::vector _arguments; std::vector _commands; }; } hipCUB-rocm-5.7.1/benchmark/common_benchmark_header.hpp000066400000000000000000000040421447643347700230740ustar00rootroot00000000000000// MIT License // // Copyright (c) 2020 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include #include #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // HIP API #include // benchmark_utils.hpp should only be included by this header. // The following definition is used as guard in benchmark_utils.hpp // Including benchmark_utils.hpp by itself will cause a compile error. #define BENCHMARK_UTILS_INCLUDE_GUARD #include "benchmark_utils.hpp" #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess){ \ std::cout << "HIP error: " << error << " line: " << __LINE__ << std::endl; \ exit(error); \ } \ } hipCUB-rocm-5.7.1/cmake/000077500000000000000000000000001447643347700146775ustar00rootroot00000000000000hipCUB-rocm-5.7.1/cmake/Dependencies.cmake000066400000000000000000000161231447643347700202720ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ########################### # hipCUB dependencies # ########################### # HIP dependency is handled earlier in the project cmake file # when VerifyCompiler.cmake is included. # For downloading, building, and installing required dependencies include(cmake/DownloadProject.cmake) # CUB (only for CUDA platform) if(HIP_COMPILER STREQUAL "nvcc") if(NOT DOWNLOAD_CUB) find_package(cub QUIET) find_package(thrust QUIET) endif() if(NOT DEFINED CUB_INCLUDE_DIR) file( DOWNLOAD https://github.com/NVIDIA/cub/archive/2.0.1.zip ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip STATUS cub_download_status LOG cub_download_log ) list(GET cub_download_status 0 cub_download_error_code) if(cub_download_error_code) message(FATAL_ERROR "Error: downloading " "https://github.com/NVIDIA/cub/archive/2.0.1.zip failed " "error_code: ${cub_download_error_code} " "log: ${cub_download_log} " ) endif() execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE cub_unpack_error_code ) if(cub_unpack_error_code) message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1.zip failed") endif() set(CUB_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/cub-2.0.1/ CACHE PATH "") endif() if(NOT DEFINED THRUST_INCLUDE_DIR) file( DOWNLOAD https://github.com/NVIDIA/thrust/archive/2.0.1.zip ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip STATUS thrust_download_status LOG thrust_download_log ) list(GET thrust_download_status 0 thrust_download_error_code) if(thrust_download_error_code) message(FATAL_ERROR "Error: downloading " "https://github.com/NVIDIA/thrust/archive/2.0.1.zip failed " "error_code: ${thrust_download_error_code} " "log: ${thrust_download_log} " ) endif() execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE thrust_unpack_error_code ) if(thrust_unpack_error_code) message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1.zip failed") endif() set(THRUST_INCLUDE_DIR ${CMAKE_CURRENT_BINARY_DIR}/thrust-2.0.1/ CACHE PATH "") endif() else() # rocPRIM (only for ROCm platform) if(NOT DOWNLOAD_ROCPRIM) find_package(rocprim QUIET) endif() if(NOT rocprim_FOUND) message(STATUS "Downloading and building rocprim.") download_project( PROJ rocprim GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git GIT_TAG develop GIT_SHALLOW TRUE INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/deps/rocprim CMAKE_ARGS -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_PREFIX_PATH=/opt/rocm LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_BUILD TRUE LOG_INSTALL TRUE BUILD_PROJECT TRUE UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository ) find_package(rocprim REQUIRED CONFIG PATHS ${CMAKE_CURRENT_BINARY_DIR}/deps/rocprim NO_DEFAULT_PATH) endif() endif() # Test dependencies if(BUILD_TEST) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(GTest 1.11.0 CONFIG QUIET) endif() if(NOT TARGET GTest::gtest) message(STATUS "GTest not found or force download GTest on. Downloading and building GTest.") if(WIN32) set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=cl") endif() set(GTEST_ROOT ${CMAKE_CURRENT_BINARY_DIR}/gtest CACHE PATH "") download_project( PROJ googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG release-1.11.0 GIT_SHALLOW TRUE INSTALL_DIR ${GTEST_ROOT} CMAKE_ARGS -DBUILD_GTEST=ON -DINSTALL_GTEST=ON -Dgtest_force_shared_crt=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_INSTALL_PREFIX= ${COMPILER_OVERRIDE} LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_BUILD TRUE LOG_INSTALL TRUE BUILD_PROJECT TRUE UPDATE_DISCONNECTED TRUE # Never update automatically from the remote repository ) list(APPEND CMAKE_PREFIX_PATH ${GTEST_ROOT}) find_package(GTest 1.11.0 EXACT CONFIG REQUIRED PATHS ${GTEST_ROOT}) endif() endif() # Benchmark dependencies if(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) # Google Benchmark (https://github.com/google/benchmark.git) find_package(benchmark QUIET) endif() if(NOT benchmark_FOUND) message(STATUS "Google Benchmark not found or force download Google Benchmark on. Downloading and building Google Benchmark.") if(CMAKE_CONFIGURATION_TYPES) message(FATAL_ERROR "DownloadProject.cmake doesn't support multi-configuration generators.") endif() set(GOOGLEBENCHMARK_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/googlebenchmark CACHE PATH "") if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")) # hip-clang cannot compile googlebenchmark for some reason if(WIN32) set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=cl") else() set(COMPILER_OVERRIDE "-DCMAKE_CXX_COMPILER=g++") endif() endif() download_project( PROJ googlebenchmark GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.6.1 GIT_SHALLOW TRUE INSTALL_DIR ${GOOGLEBENCHMARK_ROOT} CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBENCHMARK_ENABLE_TESTING=OFF -DBUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_STANDARD=14 ${COMPILER_OVERRIDE} LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_BUILD TRUE LOG_INSTALL TRUE BUILD_PROJECT TRUE UPDATE_DISCONNECTED TRUE ) endif() find_package(benchmark REQUIRED CONFIG PATHS ${GOOGLEBENCHMARK_ROOT}) endif() hipCUB-rocm-5.7.1/cmake/DownloadProject.CMakeLists.cmake.in000066400000000000000000000020011447643347700233730ustar00rootroot00000000000000# Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. cmake_minimum_required(VERSION 2.8.2) project(${DL_ARGS_PROJ}-download NONE) include(ExternalProject) if(${DL_ARGS_BUILD_PROJECT}) ExternalProject_Add(${DL_ARGS_PROJ}-download ${DL_ARGS_UNPARSED_ARGUMENTS} SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" BUILD_IN_SOURCE TRUE TEST_COMMAND "" ) else() ExternalProject_Add(${DL_ARGS_PROJ}-download ${DL_ARGS_UNPARSED_ARGUMENTS} SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" BUILD_IN_SOURCE TRUE TEST_COMMAND "" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" ) endif() hipCUB-rocm-5.7.1/cmake/DownloadProject.cmake000066400000000000000000000167451447643347700210140ustar00rootroot00000000000000# Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. # # MODULE: DownloadProject # # PROVIDES: # download_project( PROJ projectName # [PREFIX prefixDir] # [DOWNLOAD_DIR downloadDir] # [SOURCE_DIR srcDir] # [BINARY_DIR binDir] # [QUIET] # ... # ) # # Provides the ability to download and unpack a tarball, zip file, git repository, # etc. at configure time (i.e. when the cmake command is run). How the downloaded # and unpacked contents are used is up to the caller, but the motivating case is # to download source code which can then be included directly in the build with # add_subdirectory() after the call to download_project(). Source and build # directories are set up with this in mind. # # The PROJ argument is required. The projectName value will be used to construct # the following variables upon exit (obviously replace projectName with its actual # value): # # projectName_SOURCE_DIR # projectName_BINARY_DIR # # The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically # need to be provided. They can be specified if you want the downloaded source # and build directories to be located in a specific place. The contents of # projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the # locations used whether you provide SOURCE_DIR/BINARY_DIR or not. # # The DOWNLOAD_DIR argument does not normally need to be set. It controls the # location of the temporary CMake build used to perform the download. # # The PREFIX argument can be provided to change the base location of the default # values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments # are provided, then PREFIX will have no effect. The default value for PREFIX is # CMAKE_BINARY_DIR. # # The QUIET option can be given if you do not want to show the output associated # with downloading the specified project. # # In addition to the above, any other options are passed through unmodified to # ExternalProject_Add() to perform the actual download, patch and update steps. # # Only those ExternalProject_Add() arguments which relate to downloading, patching # and updating of the project sources are intended to be used. Also note that at # least one set of download-related arguments are required. # # If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to # prevent a check at the remote end for changes every time CMake is run # after the first successful download. See the documentation of the ExternalProject # module for more information. It is likely you will want to use this option if it # is available to you. Note, however, that the ExternalProject implementation contains # bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when # using the URL download method or when specifying a SOURCE_DIR with no download # method. Fixes for these have been created, the last of which is scheduled for # inclusion in CMake 3.8.0. Details can be found here: # # https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c # https://gitlab.kitware.com/cmake/cmake/issues/16428 # # If you experience build errors related to the update step, consider avoiding # the use of UPDATE_DISCONNECTED. # # EXAMPLE USAGE: # # include(DownloadProject) # download_project(PROJ googletest # GIT_REPOSITORY https://github.com/google/googletest.git # GIT_TAG master # UPDATE_DISCONNECTED 1 # QUIET # ) # # add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) # #======================================================================================== set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}") include(CMakeParseArguments) function(download_project) set(options QUIET) set(oneValueArgs PROJ PREFIX DOWNLOAD_DIR SOURCE_DIR BINARY_DIR BUILD_PROJECT ) set(multiValueArgs "") cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) # Hide output if requested if (DL_ARGS_QUIET) set(OUTPUT_QUIET "OUTPUT_QUIET") else() unset(OUTPUT_QUIET) message(STATUS "Downloading/updating ${DL_ARGS_PROJ}") endif() # Set up where we will put our temporary CMakeLists.txt file and also # the base point below which the default source and binary dirs will be. # The prefix must always be an absolute path. if (NOT DL_ARGS_PREFIX) set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}") else() get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") endif() if (NOT DL_ARGS_DOWNLOAD_DIR) set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download") endif() # Ensure the caller can know where to find the source and build directories if (NOT DL_ARGS_SOURCE_DIR) set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src") endif() if (NOT DL_ARGS_BINARY_DIR) set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build") endif() set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE) set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE) # The way that CLion manages multiple configurations, it causes a copy of # the CMakeCache.txt to be copied across due to it not expecting there to # be a project within a project. This causes the hard-coded paths in the # cache to be copied and builds to fail. To mitigate this, we simply # remove the cache if it exists before we configure the new project. It # is safe to do so because it will be re-generated. Since this is only # executed at the configure step, it should not cause additional builds or # downloads. file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt") # Create and build a separate CMake project to carry out the download. # If we've already previously done these steps, they will not cause # anything to be updated, so extra rebuilds of the project won't occur. # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project # has this set to something not findable on the PATH. configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in" "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt") execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}" . RESULT_VARIABLE result ${OUTPUT_QUIET} WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" ) if(result) message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}") endif() execute_process(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result ${OUTPUT_QUIET} WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" ) if(result) message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}") endif() endfunction() hipCUB-rocm-5.7.1/cmake/GenerateResourceSpec.cmake000066400000000000000000000066221447643347700217640ustar00rootroot00000000000000#!/usr/bin/cmake -P find_program(ROCMINFO_EXECUTABLE rocminfo ) if(NOT ROCMINFO_EXECUTABLE) message(FATAL_ERROR "rocminfo not found") endif() execute_process( COMMAND ${ROCMINFO_EXECUTABLE} RESULT_VARIABLE ROCMINFO_EXIT_CODE OUTPUT_VARIABLE ROCMINFO_STDOUT ERROR_VARIABLE ROCMINFO_STDERR ) if(ROCMINFO_EXIT_CODE) message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}") message(FATAL_ERROR ${ROCMINFO_STDERR}) endif() string(REGEX MATCHALL [[--(gfx[0-9]+)]] ROCMINFO_MATCHES ${ROCMINFO_STDOUT} ) # NOTE: Unfortunately we don't have structs in CMake, # neither do we have std::partition only list(SORT) # # Transform raw regex matches to pairs of gfx IP and device id # This will be our struct emulation. In C++ it would be # # struct device # { # std::string ip; # int id; # }; # # std::vector GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} }; # std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(), # [](const device& lhs, const device& rhs) # { # return std::lexicographical_compare(lhs.ip.begin(), lhs.ip.end(), # rhs.ip.begin(), rhs.ip.end()); # }); # set(GFXIP_AND_ID) set(ID 0) foreach(ROCMINFO_MATCH IN LISTS ROCMINFO_MATCHES) string(REGEX REPLACE "--" "" ROCMINFO_MATCH ${ROCMINFO_MATCH} ) list(APPEND GFXIP_AND_ID "${ROCMINFO_MATCH}:${ID}") math(EXPR ID "${ID} + 1") endforeach() list(SORT GFXIP_AND_ID) # Now comes the tricky part: implementing the following C++ logic # # std::stringstream JSON_PAYLOAD; # auto it = GFXIP_AND_ID.begin(); # while (it != GFXIP_AND_ID.end()) # { # auto IT = std::find_if(it, GFXIP_AND_ID.end(), # [=](const device& ip_id){ return ip_id.ip.compare(it->ip) != 0; }); # JSON_PAYLOAD << "\n \"" << it->ip << "\": ["; # std::for_each(it, IT, [&](const device& ip_id) # { # JSON_PAYLOAD << # "\n {\n" << # " \"id\": \"" << ip_id.id << "\"\n" << # " },"; # }); # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # JSON_PAYLOAD << "\n ],"; # it = IT; # } # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # set(JSON_PAYLOAD) set(IT1 0) list(GET GFXIP_AND_ID ${IT1} I1) string(REGEX REPLACE ":[0-9]+" "" IP1 ${I1}) list(LENGTH GFXIP_AND_ID COUNT) while(IT1 LESS COUNT) string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [") set(IT2 ${IT1}) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT) string(APPEND JSON_PAYLOAD "\n {\n" " \"id\": \"${ID2}\"\n" " }," ) math(EXPR IT2 "${IT2} + 1") if(IT2 LESS COUNT) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9]+:]] "" ID2 ${I2}) endif() endwhile() string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) string(APPEND JSON_PAYLOAD "\n ],") set(IT1 ${IT2}) set(IP1 ${IP2}) endwhile() string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) set(JSON_HEAD [[{ "version": { "major": 1, "minor": 0 }, "local": [ {]] ) set(JSON_TAIL [[ } ] }]] ) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/resources.json ${JSON_HEAD} ${JSON_PAYLOAD} ${JSON_TAIL} ) hipCUB-rocm-5.7.1/cmake/ROCMExportTargetsHeaderOnly.cmake000066400000000000000000000133251447643347700231540ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # This file is a worksaround for issues rocm-cmake packaging style and PyTorch. # TODO: remove when there is a fix for this issue in either rocm-cmake or PyTorch. include(CMakeParseArguments) include(GNUInstallDirs) include(ROCMPackageConfigHelpers) include(ROCMInstallTargets) set(ROCM_INSTALL_LIBDIR lib) function(rocm_write_package_template_function_if FILENAME NAME CHECK_VARIABLE) string(REPLACE ";" " " ARGS "${ARGN}") file(APPEND ${FILENAME} " if(NOT (DEFINED ${CHECK_VARIABLE} AND ${CHECK_VARIABLE}) ) ${NAME}(${ARGS}) endif() ") endfunction() function(rocm_export_targets_header_only) set(options) set(oneValueArgs NAMESPACE EXPORT NAME COMPATIBILITY PREFIX) set(multiValueArgs TARGETS DEPENDS INCLUDE) cmake_parse_arguments(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(PACKAGE_NAME ${PROJECT_NAME}) if(PARSE_NAME) set(PACKAGE_NAME ${PARSE_NAME}) endif() string(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UPPER) string(TOLOWER ${PACKAGE_NAME} PACKAGE_NAME_LOWER) set(TARGET_FILE ${PACKAGE_NAME_LOWER}-targets) if(PARSE_EXPORT) set(TARGET_FILE ${PARSE_EXPORT}) endif() set(CONFIG_NAME ${PACKAGE_NAME_LOWER}-config) set(TARGET_VERSION ${PROJECT_VERSION}) if(PARSE_PREFIX) set(PREFIX_DIR ${PARSE_PREFIX}) set(PREFIX_ARG PREFIX ${PREFIX_DIR}) set(BIN_INSTALL_DIR ${PREFIX_DIR}/${CMAKE_INSTALL_BINDIR}) set(LIB_INSTALL_DIR ${PREFIX_DIR}/${ROCM_INSTALL_LIBDIR}) set(INCLUDE_INSTALL_DIR ${PREFIX_DIR}/${CMAKE_INSTALL_INCLUDEDIR}) else() set(BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR}) set(LIB_INSTALL_DIR ${ROCM_INSTALL_LIBDIR}) set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR}) endif() set(CONFIG_PACKAGE_INSTALL_DIR ${LIB_INSTALL_DIR}/cmake/${PACKAGE_NAME_LOWER}) set(CONFIG_TEMPLATE "${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_LOWER}-config.cmake.in") file(WRITE ${CONFIG_TEMPLATE} " @PACKAGE_INIT@ ") foreach(NAME ${PACKAGE_NAME} ${PACKAGE_NAME_UPPER} ${PACKAGE_NAME_LOWER}) rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${NAME}_INCLUDE_DIR "@PACKAGE_INCLUDE_INSTALL_DIR@") rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${NAME}_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIR@") endforeach() rocm_write_package_template_function(${CONFIG_TEMPLATE} set_and_check ${PACKAGE_NAME}_TARGET_FILE "@PACKAGE_CONFIG_PACKAGE_INSTALL_DIR@/${TARGET_FILE}.cmake") if(PARSE_DEPENDS) rocm_list_split(PARSE_DEPENDS PACKAGE DEPENDS_LIST) foreach(DEPEND ${DEPENDS_LIST}) rocm_write_package_template_function(${CONFIG_TEMPLATE} find_dependency ${${DEPEND}}) endforeach() endif() foreach(INCLUDE ${PARSE_INCLUDE}) rocm_install(FILES ${INCLUDE} DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}) get_filename_component(INCLUDE_BASE ${INCLUDE} NAME) rocm_write_package_template_function(${CONFIG_TEMPLATE} include "\${CMAKE_CURRENT_LIST_DIR}/${INCLUDE_BASE}") endforeach() if(PARSE_TARGETS) rocm_write_package_template_function(${CONFIG_TEMPLATE} include "\${${PACKAGE_NAME}_TARGET_FILE}") foreach(NAME ${PACKAGE_NAME} ${PACKAGE_NAME_UPPER} ${PACKAGE_NAME_LOWER}) rocm_write_package_template_function_if(${CONFIG_TEMPLATE} set PYTORCH_FOUND_HIP ${NAME}_LIBRARIES ${PARSE_TARGETS}) rocm_write_package_template_function_if(${CONFIG_TEMPLATE} set PYTORCH_FOUND_HIP ${NAME}_LIBRARY ${PARSE_TARGETS}) endforeach() endif() rocm_configure_package_config_file( ${CONFIG_TEMPLATE} ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}.cmake INSTALL_DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} ${PREFIX_ARG} PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR CONFIG_PACKAGE_INSTALL_DIR ) set(COMPATIBILITY_ARG SameMajorVersion) if(PARSE_COMPATIBILITY) set(COMPATIBILITY_ARG ${PARSE_COMPATIBILITY}) endif() write_basic_package_version_file( ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}-version.cmake VERSION ${TARGET_VERSION} COMPATIBILITY ${COMPATIBILITY_ARG} ) set(NAMESPACE_ARG) if(PARSE_NAMESPACE) set(NAMESPACE_ARG "NAMESPACE;${PARSE_NAMESPACE}") endif() rocm_install( EXPORT ${TARGET_FILE} DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR} ${NAMESPACE_ARG} ) rocm_install( FILES ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}.cmake ${CMAKE_CURRENT_BINARY_DIR}/${CONFIG_NAME}-version.cmake DESTINATION ${CONFIG_PACKAGE_INSTALL_DIR}) endfunction() hipCUB-rocm-5.7.1/cmake/RocmCmakeDependence.cmake000066400000000000000000000054241447643347700215220ustar00rootroot00000000000000# MIT License # # Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Find or download/install rocm-cmake project find_package(ROCM 0.7.3 QUIET CONFIG PATHS /opt/rocm) if(NOT ROCM_FOUND) set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS rocm_cmake_download_status LOG rocm_cmake_download_log ) list(GET rocm_cmake_download_status 0 rocm_cmake_download_error_code) if(rocm_cmake_download_error_code) message(FATAL_ERROR "Error: downloading " "https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip failed " "error_code: ${rocm_cmake_download_error_code} " "log: ${rocm_cmake_download_log} " ) endif() execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzf ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} RESULT_VARIABLE rocm_cmake_unpack_error_code ) execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake . WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag} ) execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if(rocm_cmake_unpack_error_code) message(FATAL_ERROR "Error: unpacking ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake-${rocm_cmake_tag}.zip failed") endif() find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${CMAKE_CURRENT_BINARY_DIR}/rocm-cmake ) endif() hipCUB-rocm-5.7.1/cmake/SetupNVCC.cmake000066400000000000000000000123061447643347700174550ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # Find HIP package and verify that correct C++ compiler was selected for available # platform. On ROCm platform host and device code is compiled by the same compiler: # hipcc or clang. On CUDA host can be compiled by any C++ compiler while device # code is compiled by nvcc compiler (CMake's CUDA package handles this). # A function for automatic detection of the CC of the installed NV GPUs function(hip_cuda_detect_cc out_variable) set(__cufile ${PROJECT_BINARY_DIR}/detect_nvgpus_cc.cu) file(WRITE ${__cufile} "" "#include \n" "#include \n" "int main()\n" "{\n" " int count = 0;\n" " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" " if (count == 0) return -1;\n" " std::set list_cc;\n" " for (int device = 0; device < count; ++device)\n" " {\n" " cudaDeviceProp prop;\n" " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" " list_cc.insert(prop.major*10+prop.minor);\n" " }\n" " for (std::set::iterator itr = list_cc.begin(); itr != list_cc.end(); itr++)\n" " {\n" " if(itr != list_cc.begin()) std::cout << ';';\n" " std::cout << *itr;\n" " }\n" " return 0;\n" "}\n") execute_process( COMMAND ${HIP_HIPCC_EXECUTABLE} "-Wno-deprecated-gpu-targets" "--run" "${__cufile}" WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out ) if(__nvcc_res EQUAL 0) set(HIP_CUDA_detected_cc ${__nvcc_out} CACHE INTERNAL "The detected CC of installed NV GPUs" FORCE) endif() if(NOT HIP_CUDA_detected_cc) set(HIP_CUDA_detected_cc "53") set(${out_variable} ${HIP_CUDA_detected_cc} PARENT_SCOPE) else() set(${out_variable} ${HIP_CUDA_detected_cc} PARENT_SCOPE) endif() endfunction() ################################################################################################ ### Non macro/function section ################################################################################################ # Set the default value for CMAKE_CUDA_COMPILER if it's empty if(CMAKE_CUDA_COMPILER STREQUAL "") set(CMAKE_CUDA_COMPILER "nvcc") endif() # Get CUDA enable_language("CUDA") set(CMAKE_CUDA_STANDARD 14) # Suppressing warnings set(HIP_NVCC_FLAGS " ${HIP_NVCC_FLAGS} -Wno-deprecated-gpu-targets -Xcompiler -Wno-return-type -Wno-deprecated-declarations ") # Use NVGPU_TARGETS to set CUDA architectures (compute capabilities) # For example: -DNVGPU_TARGETS="50;61;62" set(DEFAULT_NVGPU_TARGETS "") # If NVGPU_TARGETS is empty get default value for it if("x${NVGPU_TARGETS}" STREQUAL "x") hip_cuda_detect_cc(detected_cc) set(DEFAULT_NVGPU_TARGETS "${detected_cc}") endif() set(NVGPU_TARGETS "${DEFAULT_NVGPU_TARGETS}" CACHE STRING "List of NVIDIA GPU targets (compute capabilities), for example \"35;50\"" ) set(CMAKE_CUDA_ARCHITECTURES ${NVGPU_TARGETS}) # Generate compiler flags based on targeted CUDA architectures if CMake doesn't. (Controlled by policy CP0104, on by default after 3.18) if(CMAKE_VERSION VERSION_LESS "3.18") foreach(CUDA_ARCH ${NVGPU_TARGETS}) list(APPEND HIP_NVCC_FLAGS "--generate-code arch=compute_${CUDA_ARCH},code=sm_${CUDA_ARCH} ") list(APPEND HIP_NVCC_FLAGS "--generate-code arch=compute_${CUDA_ARCH},code=compute_${CUDA_ARCH} ") endforeach() endif() execute_process( COMMAND ${HIP_HIPCONFIG_EXECUTABLE} --cpp_config OUTPUT_VARIABLE HIP_CPP_CONFIG_FLAGS OUTPUT_STRIP_TRAILING_WHITESPACE ERROR_STRIP_TRAILING_WHITESPACE ) # Update list parameter string(REPLACE ";" " " HIP_NVCC_FLAGS ${HIP_NVCC_FLAGS}) set(CMAKE_CUDA_FLAGS "${HIP_CPP_CONFIG_FLAGS} ${HIP_NVCC_FLAGS}" CACHE STRING "Cuda compile flags" FORCE) # Ignore warnings about #pragma unroll # and about deprecated CUDA function(s) used in hip/nvcc_detail/hip_runtime_api.h # set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${HIP_CPP_CONFIG_FLAGS_STRIP} -Wno-unknown-pragmas -Wno-deprecated-declarations" CACHE STRING "compile flags" FORCE) hipCUB-rocm-5.7.1/cmake/Summary.cmake000066400000000000000000000054161447643347700173440ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2019 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. function(print_configuration_summary) message(STATUS "") message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") message(STATUS " HIP ROOT : ${HIP_ROOT_DIR}") message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") if(HIP_COMPILER STREQUAL "nvcc") string(REPLACE ";" " " HIP_NVCC_FLAGS_STRIP "${HIP_NVCC_FLAGS}") string(STRIP "${HIP_NVCC_FLAGS_STRIP}" HIP_NVCC_FLAGS_STRIP) string(REPLACE ";" " " HIP_CPP_CONFIG_FLAGS_STRIP "${HIP_CPP_CONFIG_FLAGS}") string(STRIP "${HIP_CPP_CONFIG_FLAGS_STRIP}" HIP_CPP_CONFIG_FLAGS_STRIP) message(STATUS " HIP flags : ${HIP_CPP_CONFIG_FLAGS_STRIP}") message(STATUS " NVCC flags : ${HIP_NVCC_FLAGS_STRIP}") endif() message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") if(HIP_COMPILER STREQUAL "clang") message(STATUS " Device targets : ${GPU_TARGETS}") else() message(STATUS " Device targets : ${NVGPU_TARGETS}") endif() message(STATUS "") message(STATUS " DOWNLOAD_ROCPRIM : ${DOWNLOAD_ROCPRIM}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_ADDRESS_SANITIZER : ${BUILD_ADDRESS_SANITIZER}") endfunction() hipCUB-rocm-5.7.1/cmake/VerifyCompiler.cmake000066400000000000000000000044121447643347700206410ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2020 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. list(APPEND CMAKE_PREFIX_PATH /opt/rocm /opt/rocm/hip) if(CMAKE_CXX_COMPILER MATCHES ".*nvcc$" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") list(APPEND CMAKE_MODULE_PATH /opt/rocm/hip/cmake) find_package(hip QUIET CONFIG PATHS /opt/rocm) if(NOT hip_FOUND) find_package(HIP REQUIRED) endif() if(HIP_COMPILER STREQUAL "clang") # TODO: The HIP package on NVIDIA platform is incorrect at few versions set(HIP_COMPILER "nvcc" CACHE STRING "HIP Compiler" FORCE) endif() else() find_package(hip REQUIRED CONFIG PATHS /opt/rocm) endif() if(HIP_COMPILER STREQUAL "nvcc") if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") include(SetupNVCC) else() message(WARNING "On CUDA platform 'g++' is recommended C++ compiler.") endif() elseif(HIP_COMPILER STREQUAL "clang") if(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")) message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") endif() else() message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform) or `nvcc` (NVIDIA CUDA platform).") endif() hipCUB-rocm-5.7.1/conanfile.py000066400000000000000000000011301447643347700161220ustar00rootroot00000000000000# Copyright 2021 Advanced Micro Devices, Inc. # This conanfile is used to install development requirements, # e.g. # conan install -o clients=True -if build/deps . from conans import ConanFile, CMake class ConanPkgReqs(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake_find_package" options = { "shared": [True, False], "clients": [True, False], } default_options = { "shared": True, "clients": False, } def requirements(self): if self.options.clients: self.requires("gtest/1.11.0") hipCUB-rocm-5.7.1/docs/000077500000000000000000000000001447643347700145475ustar00rootroot00000000000000hipCUB-rocm-5.7.1/docs/.doxygen/000077500000000000000000000000001447643347700163025ustar00rootroot00000000000000hipCUB-rocm-5.7.1/docs/.doxygen/Doxyfile000066400000000000000000003174001447643347700200150ustar00rootroot00000000000000# Doxyfile 1.8.11 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = hipCUB # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docBin # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../hipcub/include/ # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful is your file systems doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also make the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen to replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../hipcub/include/hipcub # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, # *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = */detail/* # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = detail::* # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # # # where is the value of the INPUT_FILTER tag, and is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set the YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse-libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = NO # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = ../_doxygen/header.html # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = ../_doxygen/footer.html # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = ../_doxygen/stylesheet.css # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = ../_doxygen/extra_stylesheet.css # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want to formulas look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically # , /